[llvm] 9122c52 - [RISCV] Enable bidirectional scheduling and tracking register pressure (#115445)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 15 01:53:38 PST 2024


Author: Pengcheng Wang
Date: 2024-11-15T17:53:14+08:00
New Revision: 9122c5235ec85ce0c0ad337e862b006e7b349d84

URL: https://github.com/llvm/llvm-project/commit/9122c5235ec85ce0c0ad337e862b006e7b349d84
DIFF: https://github.com/llvm/llvm-project/commit/9122c5235ec85ce0c0ad337e862b006e7b349d84.diff

LOG: [RISCV] Enable bidirectional scheduling and tracking register pressure (#115445)


This is based on other targets like PPC/AArch64 and some experiments.

This PR only enables bidirectional scheduling and register-pressure
tracking.

Disclaimer: I haven't tested this on many cores, so maybe we should turn
some of these options into features. I believe downstreams must have
tried this before, so feedback is welcome.

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVSubtarget.cpp
    llvm/lib/Target/RISCV/RISCVSubtarget.h
    llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
    llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
    llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
    llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
    llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
    llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
    llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
    llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll
    llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll
    llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
    llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
    llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
    llvm/test/CodeGen/RISCV/abds-neg.ll
    llvm/test/CodeGen/RISCV/abds.ll
    llvm/test/CodeGen/RISCV/abdu-neg.ll
    llvm/test/CodeGen/RISCV/abdu.ll
    llvm/test/CodeGen/RISCV/add-before-shl.ll
    llvm/test/CodeGen/RISCV/add-imm.ll
    llvm/test/CodeGen/RISCV/addcarry.ll
    llvm/test/CodeGen/RISCV/addimm-mulimm.ll
    llvm/test/CodeGen/RISCV/alu16.ll
    llvm/test/CodeGen/RISCV/alu8.ll
    llvm/test/CodeGen/RISCV/and.ll
    llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
    llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
    llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
    llvm/test/CodeGen/RISCV/atomic-rmw.ll
    llvm/test/CodeGen/RISCV/atomic-signext.ll
    llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
    llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
    llvm/test/CodeGen/RISCV/avgceils.ll
    llvm/test/CodeGen/RISCV/avgceilu.ll
    llvm/test/CodeGen/RISCV/avgfloors.ll
    llvm/test/CodeGen/RISCV/avgflooru.ll
    llvm/test/CodeGen/RISCV/bf16-promote.ll
    llvm/test/CodeGen/RISCV/bfloat-arith.ll
    llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll
    llvm/test/CodeGen/RISCV/bfloat-convert.ll
    llvm/test/CodeGen/RISCV/bfloat-fcmp.ll
    llvm/test/CodeGen/RISCV/bfloat-mem.ll
    llvm/test/CodeGen/RISCV/bfloat.ll
    llvm/test/CodeGen/RISCV/bitextract-mac.ll
    llvm/test/CodeGen/RISCV/bittest.ll
    llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
    llvm/test/CodeGen/RISCV/calling-conv-half.ll
    llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
    llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
    llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
    llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
    llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
    llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
    llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
    llvm/test/CodeGen/RISCV/compress.ll
    llvm/test/CodeGen/RISCV/condbinops.ll
    llvm/test/CodeGen/RISCV/condops.ll
    llvm/test/CodeGen/RISCV/copysign-casts.ll
    llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
    llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
    llvm/test/CodeGen/RISCV/div-by-constant.ll
    llvm/test/CodeGen/RISCV/div-pow2.ll
    llvm/test/CodeGen/RISCV/div.ll
    llvm/test/CodeGen/RISCV/double-arith.ll
    llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
    llvm/test/CodeGen/RISCV/double-calling-conv.ll
    llvm/test/CodeGen/RISCV/double-convert.ll
    llvm/test/CodeGen/RISCV/double-imm.ll
    llvm/test/CodeGen/RISCV/double-intrinsics.ll
    llvm/test/CodeGen/RISCV/double-mem.ll
    llvm/test/CodeGen/RISCV/double-previous-failure.ll
    llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
    llvm/test/CodeGen/RISCV/double-select-fcmp.ll
    llvm/test/CodeGen/RISCV/double_reduct.ll
    llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
    llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
    llvm/test/CodeGen/RISCV/float-arith.ll
    llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
    llvm/test/CodeGen/RISCV/float-convert.ll
    llvm/test/CodeGen/RISCV/float-intrinsics.ll
    llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
    llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
    llvm/test/CodeGen/RISCV/fold-binop-into-select.ll
    llvm/test/CodeGen/RISCV/forced-atomics.ll
    llvm/test/CodeGen/RISCV/fp128.ll
    llvm/test/CodeGen/RISCV/fpclamptosat.ll
    llvm/test/CodeGen/RISCV/fpenv.ll
    llvm/test/CodeGen/RISCV/ghccc-rv32.ll
    llvm/test/CodeGen/RISCV/ghccc-rv64.ll
    llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll
    llvm/test/CodeGen/RISCV/global-merge.ll
    llvm/test/CodeGen/RISCV/half-arith-strict.ll
    llvm/test/CodeGen/RISCV/half-arith.ll
    llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll
    llvm/test/CodeGen/RISCV/half-br-fcmp.ll
    llvm/test/CodeGen/RISCV/half-convert.ll
    llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
    llvm/test/CodeGen/RISCV/half-fcmp.ll
    llvm/test/CodeGen/RISCV/half-intrinsics.ll
    llvm/test/CodeGen/RISCV/half-mem.ll
    llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
    llvm/test/CodeGen/RISCV/half-select-fcmp.ll
    llvm/test/CodeGen/RISCV/iabs.ll
    llvm/test/CodeGen/RISCV/imm.ll
    llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
    llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
    llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll
    llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
    llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
    llvm/test/CodeGen/RISCV/llvm.exp10.ll
    llvm/test/CodeGen/RISCV/llvm.frexp.ll
    llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
    llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll
    llvm/test/CodeGen/RISCV/machine-combiner.ll
    llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll
    llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
    llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll
    llvm/test/CodeGen/RISCV/memcmp-optsize.ll
    llvm/test/CodeGen/RISCV/memcmp.ll
    llvm/test/CodeGen/RISCV/memcpy.ll
    llvm/test/CodeGen/RISCV/misched-mem-clustering.mir
    llvm/test/CodeGen/RISCV/mul.ll
    llvm/test/CodeGen/RISCV/neg-abs.ll
    llvm/test/CodeGen/RISCV/or-is-add.ll
    llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
    llvm/test/CodeGen/RISCV/pr51206.ll
    llvm/test/CodeGen/RISCV/pr56457.ll
    llvm/test/CodeGen/RISCV/pr58511.ll
    llvm/test/CodeGen/RISCV/pr65025.ll
    llvm/test/CodeGen/RISCV/pr68855.ll
    llvm/test/CodeGen/RISCV/pr69586.ll
    llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
    llvm/test/CodeGen/RISCV/pr95271.ll
    llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
    llvm/test/CodeGen/RISCV/rem.ll
    llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
    llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
    llvm/test/CodeGen/RISCV/rotl-rotr.ll
    llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
    llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
    llvm/test/CodeGen/RISCV/rv32zbb.ll
    llvm/test/CodeGen/RISCV/rv32zbs.ll
    llvm/test/CodeGen/RISCV/rv64-double-convert.ll
    llvm/test/CodeGen/RISCV/rv64-float-convert.ll
    llvm/test/CodeGen/RISCV/rv64-half-convert.ll
    llvm/test/CodeGen/RISCV/rv64-trampoline.ll
    llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
    llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
    llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
    llvm/test/CodeGen/RISCV/rv64zba.ll
    llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
    llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
    llvm/test/CodeGen/RISCV/rv64zbb.ll
    llvm/test/CodeGen/RISCV/rv64zbkb.ll
    llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
    llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
    llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
    llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
    llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
    llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll
    llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
    llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
    llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
    llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
    llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
    llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
    llvm/test/CodeGen/RISCV/rvv/compressstore.ll
    llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
    llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
    llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
    llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
    llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
    llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
    llvm/test/CodeGen/RISCV/rvv/expandload.ll
    llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
    llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
    llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
    llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
    llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
    llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
    llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
    llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
    llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
    llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
    llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
    llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll
    llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
    llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll
    llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll
    llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
    llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
    llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
    llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
    llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
    llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
    llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
    llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
    llvm/test/CodeGen/RISCV/rvv/pr104480.ll
    llvm/test/CodeGen/RISCV/rvv/pr52475.ll
    llvm/test/CodeGen/RISCV/rvv/pr61561.ll
    llvm/test/CodeGen/RISCV/rvv/pr88576.ll
    llvm/test/CodeGen/RISCV/rvv/pr95865.ll
    llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
    llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
    llvm/test/CodeGen/RISCV/rvv/round-vp.ll
    llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
    llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
    llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
    llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll
    llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
    llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
    llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
    llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
    llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
    llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
    llvm/test/CodeGen/RISCV/rvv/stepvector.ll
    llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
    llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
    llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
    llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll
    llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
    llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
    llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
    llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
    llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
    llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
    llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
    llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
    llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
    llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll
    llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
    llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
    llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll
    llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll
    llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll
    llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll
    llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vpload.ll
    llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vpstore.ll
    llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
    llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
    llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
    llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
    llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
    llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
    llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
    llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
    llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
    llvm/test/CodeGen/RISCV/sadd_sat.ll
    llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
    llvm/test/CodeGen/RISCV/select-binop-identity.ll
    llvm/test/CodeGen/RISCV/select-const.ll
    llvm/test/CodeGen/RISCV/select.ll
    llvm/test/CodeGen/RISCV/setcc-logic.ll
    llvm/test/CodeGen/RISCV/sextw-removal.ll
    llvm/test/CodeGen/RISCV/shift-amount-mod.ll
    llvm/test/CodeGen/RISCV/shift-and.ll
    llvm/test/CodeGen/RISCV/shifts.ll
    llvm/test/CodeGen/RISCV/shl-cttz.ll
    llvm/test/CodeGen/RISCV/shlimm-addimm.ll
    llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
    llvm/test/CodeGen/RISCV/signed-truncation-check.ll
    llvm/test/CodeGen/RISCV/split-offsets.ll
    llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
    llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
    llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
    llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
    llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
    llvm/test/CodeGen/RISCV/stack-store-check.ll
    llvm/test/CodeGen/RISCV/tail-calls.ll
    llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll
    llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
    llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
    llvm/test/CodeGen/RISCV/unaligned-load-store.ll
    llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
    llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
    llvm/test/CodeGen/RISCV/urem-lkk.ll
    llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
    llvm/test/CodeGen/RISCV/usub_sat_plus.ll
    llvm/test/CodeGen/RISCV/vararg-ilp32e.ll
    llvm/test/CodeGen/RISCV/vararg.ll
    llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
    llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
    llvm/test/CodeGen/RISCV/xaluo.ll
    llvm/test/CodeGen/RISCV/xtheadmac.ll
    llvm/test/CodeGen/RISCV/xtheadmemidx.ll
    llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index e7db1ededf383b..03397e1e0d89ee 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -16,6 +16,7 @@
 #include "RISCV.h"
 #include "RISCVFrameLowering.h"
 #include "RISCVTargetMachine.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/MacroFusion.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -199,3 +200,15 @@ unsigned RISCVSubtarget::getMinimumJumpTableEntries() const {
              ? RISCVMinimumJumpTableEntries
              : TuneInfo->MinimumJumpTableEntries;
 }
+
+void RISCVSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                         unsigned NumRegionInstrs) const {
+  // Do bidirectional scheduling since it provides a more balanced scheduling
+  // leading to better performance. This will increase compile time.
+  Policy.OnlyTopDown = false;
+  Policy.OnlyBottomUp = false;
+
+  // Spilling is generally expensive on all RISC-V cores, so always enable
+  // register-pressure tracking. This will increase compile time.
+  Policy.ShouldTrackPressure = true;
+}

diff  --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index f59a3737ae76f9..f2c0a3d85c998a 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -327,6 +327,9 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
   unsigned getTailDupAggressiveThreshold() const {
     return TuneInfo->TailDupAggressiveThreshold;
   }
+
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           unsigned NumRegionInstrs) const override;
 };
 } // End llvm namespace
 

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
index 330f8b16065f13..ee414992a5245c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll
@@ -25,8 +25,8 @@ define i32 @add_i8_signext_i32(i8 %a, i8 %b) {
 ; RV32IM-LABEL: add_i8_signext_i32:
 ; RV32IM:       # %bb.0: # %entry
 ; RV32IM-NEXT:    slli a0, a0, 24
-; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    slli a1, a1, 24
+; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    add a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -34,8 +34,8 @@ define i32 @add_i8_signext_i32(i8 %a, i8 %b) {
 ; RV64IM-LABEL: add_i8_signext_i32:
 ; RV64IM:       # %bb.0: # %entry
 ; RV64IM-NEXT:    slli a0, a0, 56
-; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    slli a1, a1, 56
+; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    add a0, a0, a1
 ; RV64IM-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
index f33ba1d7a302ef..bce6dfacf8e82c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/bitmanip.ll
@@ -6,8 +6,8 @@ define i2 @bitreverse_i2(i2 %x) {
 ; RV32-LABEL: bitreverse_i2:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 1
-; RV32-NEXT:    andi a1, a1, 2
 ; RV32-NEXT:    andi a0, a0, 3
+; RV32-NEXT:    andi a1, a1, 2
 ; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    ret
@@ -15,8 +15,8 @@ define i2 @bitreverse_i2(i2 %x) {
 ; RV64-LABEL: bitreverse_i2:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 1
-; RV64-NEXT:    andi a1, a1, 2
 ; RV64-NEXT:    andi a0, a0, 3
+; RV64-NEXT:    andi a1, a1, 2
 ; RV64-NEXT:    srli a0, a0, 1
 ; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    ret
@@ -28,8 +28,8 @@ define i3 @bitreverse_i3(i3 %x) {
 ; RV32-LABEL: bitreverse_i3:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 2
-; RV32-NEXT:    andi a1, a1, 4
 ; RV32-NEXT:    andi a0, a0, 7
+; RV32-NEXT:    andi a1, a1, 4
 ; RV32-NEXT:    andi a2, a0, 2
 ; RV32-NEXT:    or a1, a1, a2
 ; RV32-NEXT:    srli a0, a0, 2
@@ -39,8 +39,8 @@ define i3 @bitreverse_i3(i3 %x) {
 ; RV64-LABEL: bitreverse_i3:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 2
-; RV64-NEXT:    andi a1, a1, 4
 ; RV64-NEXT:    andi a0, a0, 7
+; RV64-NEXT:    andi a1, a1, 4
 ; RV64-NEXT:    andi a2, a0, 2
 ; RV64-NEXT:    or a1, a1, a2
 ; RV64-NEXT:    srli a0, a0, 2
@@ -54,11 +54,11 @@ define i4 @bitreverse_i4(i4 %x) {
 ; RV32-LABEL: bitreverse_i4:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 3
-; RV32-NEXT:    andi a1, a1, 8
 ; RV32-NEXT:    slli a2, a0, 1
+; RV32-NEXT:    andi a0, a0, 15
+; RV32-NEXT:    andi a1, a1, 8
 ; RV32-NEXT:    andi a2, a2, 4
 ; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    andi a0, a0, 15
 ; RV32-NEXT:    srli a2, a0, 1
 ; RV32-NEXT:    andi a2, a2, 2
 ; RV32-NEXT:    or a1, a1, a2
@@ -69,11 +69,11 @@ define i4 @bitreverse_i4(i4 %x) {
 ; RV64-LABEL: bitreverse_i4:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 3
-; RV64-NEXT:    andi a1, a1, 8
 ; RV64-NEXT:    slli a2, a0, 1
+; RV64-NEXT:    andi a0, a0, 15
+; RV64-NEXT:    andi a1, a1, 8
 ; RV64-NEXT:    andi a2, a2, 4
 ; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    andi a0, a0, 15
 ; RV64-NEXT:    srli a2, a0, 1
 ; RV64-NEXT:    andi a2, a2, 2
 ; RV64-NEXT:    or a1, a1, a2
@@ -88,21 +88,21 @@ define i7 @bitreverse_i7(i7 %x) {
 ; RV32-LABEL: bitreverse_i7:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 6
-; RV32-NEXT:    andi a1, a1, 64
 ; RV32-NEXT:    slli a2, a0, 4
+; RV32-NEXT:    slli a3, a0, 2
+; RV32-NEXT:    andi a0, a0, 127
+; RV32-NEXT:    andi a1, a1, 64
 ; RV32-NEXT:    andi a2, a2, 32
+; RV32-NEXT:    andi a3, a3, 16
 ; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    slli a2, a0, 2
-; RV32-NEXT:    andi a2, a2, 16
-; RV32-NEXT:    andi a0, a0, 127
-; RV32-NEXT:    andi a3, a0, 8
-; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    andi a2, a0, 8
+; RV32-NEXT:    or a2, a3, a2
+; RV32-NEXT:    srli a3, a0, 2
 ; RV32-NEXT:    or a1, a1, a2
-; RV32-NEXT:    srli a2, a0, 2
-; RV32-NEXT:    andi a2, a2, 4
-; RV32-NEXT:    srli a3, a0, 4
-; RV32-NEXT:    andi a3, a3, 2
-; RV32-NEXT:    or a2, a2, a3
+; RV32-NEXT:    srli a2, a0, 4
+; RV32-NEXT:    andi a3, a3, 4
+; RV32-NEXT:    andi a2, a2, 2
+; RV32-NEXT:    or a2, a3, a2
 ; RV32-NEXT:    or a1, a1, a2
 ; RV32-NEXT:    srli a0, a0, 6
 ; RV32-NEXT:    or a0, a1, a0
@@ -111,21 +111,21 @@ define i7 @bitreverse_i7(i7 %x) {
 ; RV64-LABEL: bitreverse_i7:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 6
-; RV64-NEXT:    andi a1, a1, 64
 ; RV64-NEXT:    slli a2, a0, 4
+; RV64-NEXT:    slli a3, a0, 2
+; RV64-NEXT:    andi a0, a0, 127
+; RV64-NEXT:    andi a1, a1, 64
 ; RV64-NEXT:    andi a2, a2, 32
+; RV64-NEXT:    andi a3, a3, 16
 ; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    slli a2, a0, 2
-; RV64-NEXT:    andi a2, a2, 16
-; RV64-NEXT:    andi a0, a0, 127
-; RV64-NEXT:    andi a3, a0, 8
-; RV64-NEXT:    or a2, a2, a3
+; RV64-NEXT:    andi a2, a0, 8
+; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    srli a3, a0, 2
 ; RV64-NEXT:    or a1, a1, a2
-; RV64-NEXT:    srli a2, a0, 2
-; RV64-NEXT:    andi a2, a2, 4
-; RV64-NEXT:    srli a3, a0, 4
-; RV64-NEXT:    andi a3, a3, 2
-; RV64-NEXT:    or a2, a2, a3
+; RV64-NEXT:    srli a2, a0, 4
+; RV64-NEXT:    andi a3, a3, 4
+; RV64-NEXT:    andi a2, a2, 2
+; RV64-NEXT:    or a2, a3, a2
 ; RV64-NEXT:    or a1, a1, a2
 ; RV64-NEXT:    srli a0, a0, 6
 ; RV64-NEXT:    or a0, a1, a0
@@ -139,33 +139,33 @@ define i24 @bitreverse_i24(i24 %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a0, 16
 ; RV32-NEXT:    lui a2, 4096
+; RV32-NEXT:    lui a3, 1048335
 ; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    addi a3, a3, 240
 ; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    srli a0, a0, 16
 ; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    lui a1, 1048335
-; RV32-NEXT:    addi a1, a1, 240
-; RV32-NEXT:    and a3, a1, a2
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    srli a3, a3, 4
+; RV32-NEXT:    and a1, a3, a2
+; RV32-NEXT:    and a1, a0, a1
 ; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    or a0, a3, a0
-; RV32-NEXT:    lui a1, 1047757
-; RV32-NEXT:    addi a1, a1, -820
-; RV32-NEXT:    and a3, a1, a2
-; RV32-NEXT:    and a3, a0, a3
-; RV32-NEXT:    srli a3, a3, 2
+; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    lui a3, 1047757
+; RV32-NEXT:    addi a3, a3, -820
+; RV32-NEXT:    srli a1, a1, 4
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    and a1, a3, a2
+; RV32-NEXT:    and a1, a0, a1
 ; RV32-NEXT:    slli a0, a0, 2
-; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    or a0, a3, a0
-; RV32-NEXT:    lui a1, 1047211
-; RV32-NEXT:    addi a1, a1, -1366
-; RV32-NEXT:    and a2, a1, a2
+; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    lui a3, 1047211
+; RV32-NEXT:    addi a3, a3, -1366
+; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    srli a2, a2, 1
 ; RV32-NEXT:    slli a0, a0, 1
-; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    or a0, a2, a0
 ; RV32-NEXT:    ret
 ;
@@ -173,33 +173,33 @@ define i24 @bitreverse_i24(i24 %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a1, a0, 16
 ; RV64-NEXT:    lui a2, 4096
+; RV64-NEXT:    lui a3, 1048335
 ; RV64-NEXT:    addiw a2, a2, -1
+; RV64-NEXT:    addiw a3, a3, 240
 ; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    srli a0, a0, 16
 ; RV64-NEXT:    or a0, a0, a1
-; RV64-NEXT:    lui a1, 1048335
-; RV64-NEXT:    addiw a1, a1, 240
-; RV64-NEXT:    and a3, a1, a2
-; RV64-NEXT:    and a3, a0, a3
-; RV64-NEXT:    srli a3, a3, 4
+; RV64-NEXT:    and a1, a3, a2
+; RV64-NEXT:    and a1, a0, a1
 ; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    and a0, a0, a1
-; RV64-NEXT:    or a0, a3, a0
-; RV64-NEXT:    lui a1, 1047757
-; RV64-NEXT:    addiw a1, a1, -820
-; RV64-NEXT:    and a3, a1, a2
-; RV64-NEXT:    and a3, a0, a3
-; RV64-NEXT:    srli a3, a3, 2
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    lui a3, 1047757
+; RV64-NEXT:    addiw a3, a3, -820
+; RV64-NEXT:    srli a1, a1, 4
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    and a1, a3, a2
+; RV64-NEXT:    and a1, a0, a1
 ; RV64-NEXT:    slli a0, a0, 2
-; RV64-NEXT:    and a0, a0, a1
-; RV64-NEXT:    or a0, a3, a0
-; RV64-NEXT:    lui a1, 1047211
-; RV64-NEXT:    addiw a1, a1, -1366
-; RV64-NEXT:    and a2, a1, a2
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    lui a3, 1047211
+; RV64-NEXT:    addiw a3, a3, -1366
+; RV64-NEXT:    and a2, a3, a2
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    or a0, a1, a0
 ; RV64-NEXT:    and a2, a0, a2
-; RV64-NEXT:    srli a2, a2, 1
 ; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    srli a2, a2, 1
+; RV64-NEXT:    and a0, a0, a3
 ; RV64-NEXT:    or a0, a2, a0
 ; RV64-NEXT:    ret
   %rev = call i24 @llvm.bitreverse.i24(i24 %x)

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
index 70d1b25309c844..cf7cef83bcc135 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
@@ -21,34 +21,34 @@ define void @constant_fold_barrier_i128(ptr %p) {
 ; RV32-LABEL: constant_fold_barrier_i128:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    slli a1, a1, 11
 ; RV32-NEXT:    lw a2, 0(a0)
 ; RV32-NEXT:    lw a3, 4(a0)
 ; RV32-NEXT:    lw a4, 8(a0)
 ; RV32-NEXT:    lw a5, 12(a0)
+; RV32-NEXT:    slli a1, a1, 11
 ; RV32-NEXT:    and a2, a2, a1
 ; RV32-NEXT:    and a3, a3, zero
 ; RV32-NEXT:    and a4, a4, zero
 ; RV32-NEXT:    and a5, a5, zero
 ; RV32-NEXT:    add a2, a2, a1
-; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    add a6, a3, zero
+; RV32-NEXT:    sltu a1, a2, a1
 ; RV32-NEXT:    sltu a3, a6, a3
 ; RV32-NEXT:    add a6, a6, a1
 ; RV32-NEXT:    seqz a7, a6
 ; RV32-NEXT:    and a1, a7, a1
+; RV32-NEXT:    add a7, a4, zero
+; RV32-NEXT:    add a5, a5, zero
+; RV32-NEXT:    sltu a4, a7, a4
 ; RV32-NEXT:    or a1, a3, a1
-; RV32-NEXT:    add a3, a4, zero
-; RV32-NEXT:    sltu a4, a3, a4
-; RV32-NEXT:    add a3, a3, a1
-; RV32-NEXT:    seqz a7, a3
-; RV32-NEXT:    and a1, a7, a1
+; RV32-NEXT:    add a7, a7, a1
+; RV32-NEXT:    seqz a3, a7
+; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    or a1, a4, a1
-; RV32-NEXT:    add a5, a5, zero
 ; RV32-NEXT:    add a1, a5, a1
 ; RV32-NEXT:    sw a2, 0(a0)
 ; RV32-NEXT:    sw a6, 4(a0)
-; RV32-NEXT:    sw a3, 8(a0)
+; RV32-NEXT:    sw a7, 8(a0)
 ; RV32-NEXT:    sw a1, 12(a0)
 ; RV32-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
index 51e8b6da39d099..2c3e3faddc3916 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
@@ -21,9 +21,9 @@ define i128 @constant_fold_barrier_i128(i128 %x) {
 ; RV64-LABEL: constant_fold_barrier_i128:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    li a2, 1
+; RV64-NEXT:    and a1, a1, zero
 ; RV64-NEXT:    slli a2, a2, 11
 ; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    and a1, a1, zero
 ; RV64-NEXT:    add a0, a0, a2
 ; RV64-NEXT:    sltu a2, a0, a2
 ; RV64-NEXT:    add a1, a1, zero

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
index 05989c310541b8..1156edffe91943 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll
@@ -117,8 +117,8 @@ define i64 @abs64(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    add a0, a0, a2
-; RV32I-NEXT:    sltu a3, a0, a2
 ; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    sltu a3, a0, a2
 ; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    xor a1, a1, a2
@@ -128,8 +128,8 @@ define i64 @abs64(i64 %x) {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    add a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a0, a2
 ; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a0, a2
 ; RV32ZBB-NEXT:    add a1, a1, a3
 ; RV32ZBB-NEXT:    xor a0, a0, a2
 ; RV32ZBB-NEXT:    xor a1, a1, a2

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
index c558639fda424e..68bf9240ccd1df 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll
@@ -302,8 +302,8 @@ define i64 @rori_i64(i64 %a) nounwind {
 ; CHECK-NEXT:    slli a2, a0, 31
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    slli a3, a1, 31
-; CHECK-NEXT:    or a0, a0, a3
 ; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    or a0, a0, a3
 ; CHECK-NEXT:    or a1, a2, a1
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63)

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
index 1184905c17edea..7f22127ad3536c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll
@@ -12,31 +12,31 @@ define i32 @ctlz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    beqz a0, .LBB0_2
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    lui a2, 209715
 ; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 16
@@ -63,11 +63,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: ctlz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a6, 61681
 ; RV32I-NEXT:    addi a5, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a4, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a3, a2, -241
+; RV32I-NEXT:    addi a4, a3, 819
+; RV32I-NEXT:    addi a3, a6, -241
 ; RV32I-NEXT:    li a2, 32
 ; RV32I-NEXT:    beqz a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
@@ -155,22 +155,22 @@ define i32 @cttz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    not a1, a0
 ; RV32I-NEXT:    addi a0, a0, -1
-; RV32I-NEXT:    and a0, a1, a0
-; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    lui a2, 209715
 ; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 61681
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -196,11 +196,11 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: cttz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    beqz a0, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    not a1, a0
@@ -271,17 +271,17 @@ define i32 @ctpop_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    lui a2, 209715
 ; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 61681
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -305,39 +305,39 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    li a1, 0
@@ -364,39 +364,39 @@ define i1 @ctpop_i64_ugt_two(i64 %a) nounwind {
 ; RV32I-NEXT:  .LBB6_2:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    sltiu a0, a0, 2
@@ -429,39 +429,39 @@ define i1 @ctpop_i64_ugt_one(i64 %a) nounwind {
 ; RV32I-NEXT:  .LBB7_2:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    sltiu a0, a0, 2
@@ -491,39 +491,39 @@ define i1 @ctpop_i64_eq_one(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    xori a0, a0, 1
@@ -548,39 +548,39 @@ define i1 @ctpop_i64_ne_one(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
-; RV32I-NEXT:    and a3, a5, a3
-; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    srli a3, a1, 2
-; RV32I-NEXT:    and a3, a3, a4
 ; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    and a3, a3, a4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    xori a0, a0, 1
@@ -872,8 +872,8 @@ define i64 @abs_i64(i64 %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    srai a2, a1, 31
 ; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    xor a0, a0, a2
 ; CHECK-NEXT:    xor a1, a1, a2
@@ -923,15 +923,15 @@ define i32 @bswap_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 24
 ; RV32I-NEXT:    srli a2, a0, 24
+; RV32I-NEXT:    lui a3, 16
 ; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    lui a2, 16
-; RV32I-NEXT:    addi a2, a2, -256
-; RV32I-NEXT:    and a3, a0, a2
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    addi a3, a3, -256
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    slli a0, a0, 8
+; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: bswap_i32:
@@ -949,25 +949,24 @@ define i64 @bswap_i64(i64 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a1, 24
 ; RV32I-NEXT:    srli a3, a1, 24
+; RV32I-NEXT:    lui a4, 16
+; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    slli a6, a0, 24
 ; RV32I-NEXT:    or a2, a3, a2
-; RV32I-NEXT:    lui a3, 16
-; RV32I-NEXT:    addi a3, a3, -256
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    srli a1, a1, 8
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    or a2, a1, a4
-; RV32I-NEXT:    slli a1, a0, 24
-; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    and a4, a0, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    srli a0, a0, 8
-; RV32I-NEXT:    and a0, a0, a3
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    or a1, a0, a4
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    srli a3, a0, 24
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    srli a6, a0, 8
+; RV32I-NEXT:    addi a4, a4, -256
+; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a5, a5, a4
+; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    and a4, a6, a4
+; RV32I-NEXT:    or a2, a2, a5
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    slli a5, a0, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    or a0, a2, a1
+; RV32I-NEXT:    or a1, a3, a5
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: bswap_i64:

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll
index 80e43c94aab0e6..a647eae82dddfc 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll
@@ -111,8 +111,8 @@ define i32 @packh_i32(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: packh_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a2, 16
-; CHECK-NEXT:    addi a2, a2, -256
 ; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    addi a2, a2, -256
 ; CHECK-NEXT:    slli a1, a1, 8
 ; CHECK-NEXT:    and a1, a1, a2
 ; CHECK-NEXT:    or a0, a1, a0
@@ -149,8 +149,8 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: packh_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a1, 16
-; CHECK-NEXT:    addi a1, a1, -256
 ; CHECK-NEXT:    andi a0, a0, 255
+; CHECK-NEXT:    addi a1, a1, -256
 ; CHECK-NEXT:    slli a2, a2, 8
 ; CHECK-NEXT:    and a1, a2, a1
 ; CHECK-NEXT:    or a0, a1, a0

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll
index 2bd0c78659b004..9584270d8e66f5 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zba.ll
@@ -1025,8 +1025,8 @@ define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: pack_i64_2:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a1, a0
@@ -1337,8 +1337,8 @@ define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
 ; RV64I-LABEL: array_index_lshr_sh3_sh3:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a1, 58
-; RV64I-NEXT:    slli a1, a1, 6
 ; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ld a0, 0(a0)

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
index 03f8eff90c23bd..961811d3b623c7 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll
@@ -17,32 +17,32 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -75,32 +75,32 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a1, .LBB1_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -142,32 +142,32 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a2, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -202,35 +202,35 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    li s0, -1
 ; RV64I-NEXT:    slli a1, a0, 32
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    addiw a2, a3, 1365
+; RV64I-NEXT:    srliw a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
+; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    lui a2, 4112
 ; RV64I-NEXT:    srli s1, a1, 32
-; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
+; RV64I-NEXT:    addiw a1, a3, -241
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
+; RV64I-NEXT:    addiw a1, a2, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    beqz s1, .LBB3_2
 ; RV64I-NEXT:  # %bb.1:
@@ -276,32 +276,32 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    .cfi_offset ra, -8
 ; RV64I-NEXT:    srliw a0, a0, 2
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srli a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -334,29 +334,27 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB5_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 21845
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addi a1, a2, 1365
+; RV64I-NEXT:    srli a2, a0, 2
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 8
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srli a2, a0, 32
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 13107
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
@@ -365,20 +363,22 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 3855
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, -241
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 3855
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -414,24 +414,24 @@ define signext i32 @cttz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    not a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -457,24 +457,24 @@ define signext i32 @cttz_zero_undef_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    not a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -501,24 +501,24 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    li s0, -1
 ; RV64I-NEXT:    not a0, a0
 ; RV64I-NEXT:    addi a1, s1, -1
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    slli s1, s1, 32
@@ -560,35 +560,35 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    not a0, a0
 ; RV64I-NEXT:    addi a1, s0, -1
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    slli s0, s0, 32
 ; RV64I-NEXT:    srli s0, s0, 32
-; RV64I-NEXT:    mv a1, a0
-; RV64I-NEXT:    li a0, 0
+; RV64I-NEXT:    li a1, 0
 ; RV64I-NEXT:    beqz s0, .LBB9_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    srliw a0, a1, 24
-; RV64I-NEXT:    addiw a0, a0, 1
+; RV64I-NEXT:    srliw a1, a0, 24
+; RV64I-NEXT:    addiw a1, a1, 1
 ; RV64I-NEXT:  .LBB9_2:
+; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    addi sp, sp, 16
@@ -622,19 +622,17 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    not a1, a0
 ; RV64I-NEXT:    addi a0, a0, -1
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 21845
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    slli a2, a2, 12
-; RV64I-NEXT:    addi a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    addi a1, a2, 1365
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    slli a1, a1, 12
+; RV64I-NEXT:    addi a1, a1, 1365
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 13107
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
@@ -643,19 +641,21 @@ define i64 @cttz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 3855
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 3855
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -688,19 +688,19 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -725,19 +725,19 @@ define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -762,23 +762,23 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lwu a0, 0(a0)
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
 ; RV64I-NEXT:    lui a2, 209715
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srliw a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 61681
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    lui a1, 4112
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    addiw a1, a1, 257
 ; RV64I-NEXT:    call __muldi3
 ; RV64I-NEXT:    srliw a0, a0, 24
@@ -811,8 +811,6 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    lui a2, 13107
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
@@ -821,19 +819,21 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    addi a2, a2, 819
 ; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    addi a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 3855
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    slli a2, a2, 12
+; RV64I-NEXT:    addi a2, a2, 241
+; RV64I-NEXT:    slli a2, a2, 12
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    lui a1, 3855
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, 241
-; RV64I-NEXT:    slli a1, a1, 12
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -998,8 +998,8 @@ define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: minu_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 32
-; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    srli a3, a3, 32
 ; RV64I-NEXT:    bltu a2, a3, .LBB23_2
 ; RV64I-NEXT:  # %bb.1:
@@ -1011,8 +1011,8 @@ define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64ZBB-LABEL: minu_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    slli a1, a1, 32
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    minu a0, a0, a1
 ; RV64ZBB-NEXT:    sext.w a0, a0
@@ -1046,8 +1046,8 @@ define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: maxu_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 32
-; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    srli a3, a3, 32
 ; RV64I-NEXT:    bltu a3, a2, .LBB25_2
 ; RV64I-NEXT:  # %bb.1:
@@ -1059,8 +1059,8 @@ define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64ZBB-LABEL: maxu_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    slli a1, a1, 32
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
 ; RV64ZBB-NEXT:    sext.w a0, a0
@@ -1186,15 +1186,15 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slliw a1, a0, 24
 ; RV64I-NEXT:    srliw a2, a0, 24
+; RV64I-NEXT:    lui a3, 16
 ; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    lui a2, 16
-; RV64I-NEXT:    addiw a2, a2, -256
-; RV64I-NEXT:    and a3, a0, a2
-; RV64I-NEXT:    slliw a3, a3, 8
-; RV64I-NEXT:    srliw a0, a0, 8
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    addiw a3, a3, -256
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    slliw a0, a0, 8
+; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: bswap_i32:
@@ -1213,15 +1213,15 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 24
 ; RV64I-NEXT:    srliw a3, a0, 24
+; RV64I-NEXT:    lui a4, 16
 ; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    lui a3, 16
-; RV64I-NEXT:    addi a3, a3, -256
-; RV64I-NEXT:    and a4, a0, a3
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    srliw a0, a0, 8
-; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    srliw a3, a0, 8
+; RV64I-NEXT:    addi a4, a4, -256
+; RV64I-NEXT:    and a0, a0, a4
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a2, a2, a3
 ; RV64I-NEXT:    or a0, a2, a0
-; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    sw a0, 0(a1)
 ; RV64I-NEXT:    ret
 ;
@@ -1243,29 +1243,29 @@ define i64 @bswap_i64(i64 %a) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a0, 56
 ; RV64I-NEXT:    srli a2, a0, 56
-; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    lui a2, 16
-; RV64I-NEXT:    addiw a2, a2, -256
-; RV64I-NEXT:    and a3, a0, a2
-; RV64I-NEXT:    slli a3, a3, 40
+; RV64I-NEXT:    lui a3, 16
 ; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    and a2, a4, a2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    lui a2, 4080
-; RV64I-NEXT:    and a3, a0, a2
-; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    addiw a3, a3, -256
+; RV64I-NEXT:    and a4, a4, a3
+; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    srli a4, a0, 24
-; RV64I-NEXT:    and a2, a4, a2
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    lui a3, 1044480
-; RV64I-NEXT:    and a4, a0, a3
-; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    and a4, a4, a2
+; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    slli a2, a2, 24
 ; RV64I-NEXT:    or a2, a2, a4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srli a0, a0, 8
-; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    lui a4, 1044480
+; RV64I-NEXT:    and a3, a0, a3
+; RV64I-NEXT:    slli a3, a3, 40
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    and a0, a0, a4
+; RV64I-NEXT:    slli a0, a0, 8
+; RV64I-NEXT:    or a0, a2, a0
 ; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    and a3, a3, a4
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: bswap_i64:

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
index 5cf2619a476bc0..338476a1bec83f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll
@@ -98,8 +98,8 @@ define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: pack_i64_2:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a1, a0
@@ -147,8 +147,8 @@ define signext i32 @packh_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64I-LABEL: packh_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 16
-; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    or a0, a1, a0
@@ -157,8 +157,8 @@ define signext i32 @packh_i32(i32 signext %a, i32 signext %b) nounwind {
 ; RV64ZBKB-LABEL: packh_i32:
 ; RV64ZBKB:       # %bb.0:
 ; RV64ZBKB-NEXT:    lui a2, 16
-; RV64ZBKB-NEXT:    addiw a2, a2, -256
 ; RV64ZBKB-NEXT:    andi a0, a0, 255
+; RV64ZBKB-NEXT:    addiw a2, a2, -256
 ; RV64ZBKB-NEXT:    slli a1, a1, 8
 ; RV64ZBKB-NEXT:    and a1, a1, a2
 ; RV64ZBKB-NEXT:    or a0, a1, a0
@@ -195,8 +195,8 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind {
 ; RV64I-LABEL: packh_i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 16
-; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    andi a0, a0, 255
+; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    or a0, a1, a0
@@ -205,8 +205,8 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind {
 ; RV64ZBKB-LABEL: packh_i64:
 ; RV64ZBKB:       # %bb.0:
 ; RV64ZBKB-NEXT:    lui a2, 16
-; RV64ZBKB-NEXT:    addiw a2, a2, -256
 ; RV64ZBKB-NEXT:    andi a0, a0, 255
+; RV64ZBKB-NEXT:    addiw a2, a2, -256
 ; RV64ZBKB-NEXT:    slli a1, a1, 8
 ; RV64ZBKB-NEXT:    and a1, a1, a2
 ; RV64ZBKB-NEXT:    or a0, a1, a0
@@ -307,10 +307,10 @@ define i64 @pack_i64_allWUsers(i32 signext %0, i32 signext %1, i32 signext %2) {
 ; RV64I-LABEL: pack_i64_allWUsers:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    slli a2, a2, 32
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    slli a2, a2, 32
 ; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
index 972cc9d66bfb71..fc9be949884511 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll
@@ -524,8 +524,8 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32-NEXT:    sw a6, 40(sp)
 ; ILP32-NEXT:    sw a7, 44(sp)
 ; ILP32-NEXT:    addi a1, a0, 7
-; ILP32-NEXT:    andi a1, a1, -8
 ; ILP32-NEXT:    addi a0, a0, 15
+; ILP32-NEXT:    andi a1, a1, -8
 ; ILP32-NEXT:    sw a0, 12(sp)
 ; ILP32-NEXT:    lw a0, 0(a1)
 ; ILP32-NEXT:    lw a1, 4(a1)
@@ -618,8 +618,8 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV64-NEXT:    sd a6, 64(sp)
 ; RV64-NEXT:    sd a7, 72(sp)
 ; RV64-NEXT:    addi a1, a0, 7
-; RV64-NEXT:    andi a1, a1, -8
 ; RV64-NEXT:    addi a0, a0, 15
+; RV64-NEXT:    andi a1, a1, -8
 ; RV64-NEXT:    sd a0, 8(sp)
 ; RV64-NEXT:    ld a0, 0(a1)
 ; RV64-NEXT:    addi sp, sp, 80
@@ -642,8 +642,8 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV32-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32-WITHFP-NEXT:    addi a1, a0, 7
-; RV32-WITHFP-NEXT:    andi a1, a1, -8
 ; RV32-WITHFP-NEXT:    addi a0, a0, 15
+; RV32-WITHFP-NEXT:    andi a1, a1, -8
 ; RV32-WITHFP-NEXT:    sw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    lw a0, 0(a1)
 ; RV32-WITHFP-NEXT:    lw a1, 4(a1)
@@ -669,8 +669,8 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV64-WITHFP-NEXT:    sd a6, 48(s0)
 ; RV64-WITHFP-NEXT:    sd a7, 56(s0)
 ; RV64-WITHFP-NEXT:    addi a1, a0, 7
-; RV64-WITHFP-NEXT:    andi a1, a1, -8
 ; RV64-WITHFP-NEXT:    addi a0, a0, 15
+; RV64-WITHFP-NEXT:    andi a1, a1, -8
 ; RV64-WITHFP-NEXT:    sd a0, -24(s0)
 ; RV64-WITHFP-NEXT:    ld a0, 0(a1)
 ; RV64-WITHFP-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -863,8 +863,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-NEXT:    sw a6, 24(sp)
 ; ILP32-NEXT:    sw a7, 28(sp)
 ; ILP32-NEXT:    addi a3, a0, 7
-; ILP32-NEXT:    andi a3, a3, -8
 ; ILP32-NEXT:    addi a0, a0, 15
+; ILP32-NEXT:    andi a3, a3, -8
 ; ILP32-NEXT:    sw a0, 4(sp)
 ; ILP32-NEXT:    lw a4, 0(a3)
 ; ILP32-NEXT:    lw a3, 4(a3)
@@ -966,8 +966,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV64-NEXT:    sd a6, 48(sp)
 ; RV64-NEXT:    sd a7, 56(sp)
 ; RV64-NEXT:    addi a2, a0, 7
-; RV64-NEXT:    andi a2, a2, -8
 ; RV64-NEXT:    addi a0, a0, 15
+; RV64-NEXT:    andi a2, a2, -8
 ; RV64-NEXT:    sd a0, 8(sp)
 ; RV64-NEXT:    ld a0, 0(a2)
 ; RV64-NEXT:    add a0, a1, a0
@@ -989,8 +989,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV32-WITHFP-NEXT:    sw a6, 16(s0)
 ; RV32-WITHFP-NEXT:    sw a7, 20(s0)
 ; RV32-WITHFP-NEXT:    addi a3, a0, 7
-; RV32-WITHFP-NEXT:    andi a3, a3, -8
 ; RV32-WITHFP-NEXT:    addi a0, a0, 15
+; RV32-WITHFP-NEXT:    andi a3, a3, -8
 ; RV32-WITHFP-NEXT:    sw a0, -12(s0)
 ; RV32-WITHFP-NEXT:    lw a4, 0(a3)
 ; RV32-WITHFP-NEXT:    lw a3, 4(a3)
@@ -1019,8 +1019,8 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV64-WITHFP-NEXT:    sd a6, 32(s0)
 ; RV64-WITHFP-NEXT:    sd a7, 40(s0)
 ; RV64-WITHFP-NEXT:    addi a2, a0, 7
-; RV64-WITHFP-NEXT:    andi a2, a2, -8
 ; RV64-WITHFP-NEXT:    addi a0, a0, 15
+; RV64-WITHFP-NEXT:    andi a2, a2, -8
 ; RV64-WITHFP-NEXT:    sd a0, -24(s0)
 ; RV64-WITHFP-NEXT:    ld a0, 0(a2)
 ; RV64-WITHFP-NEXT:    add a0, a1, a0
@@ -1169,9 +1169,9 @@ define void @va3_caller() nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT:    lui a0, 5
-; RV64-NEXT:    addiw a2, a0, -480
+; RV64-NEXT:    lui a1, 5
 ; RV64-NEXT:    li a0, 2
+; RV64-NEXT:    addiw a2, a1, -480
 ; RV64-NEXT:    li a1, 1111
 ; RV64-NEXT:    call va3
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1201,9 +1201,9 @@ define void @va3_caller() nounwind {
 ; RV64-WITHFP-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-WITHFP-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; RV64-WITHFP-NEXT:    addi s0, sp, 16
-; RV64-WITHFP-NEXT:    lui a0, 5
-; RV64-WITHFP-NEXT:    addiw a2, a0, -480
+; RV64-WITHFP-NEXT:    lui a1, 5
 ; RV64-WITHFP-NEXT:    li a0, 2
+; RV64-WITHFP-NEXT:    addiw a2, a1, -480
 ; RV64-WITHFP-NEXT:    li a1, 1111
 ; RV64-WITHFP-NEXT:    call va3
 ; RV64-WITHFP-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/abds-neg.ll b/llvm/test/CodeGen/RISCV/abds-neg.ll
index e7fd87bd783876..c9a48acb8d14a9 100644
--- a/llvm/test/CodeGen/RISCV/abds-neg.ll
+++ b/llvm/test/CodeGen/RISCV/abds-neg.ll
@@ -11,8 +11,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -23,8 +23,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -62,8 +62,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -74,8 +74,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -113,8 +113,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -125,8 +125,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -164,8 +164,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -176,8 +176,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -266,8 +266,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i16_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -278,8 +278,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -625,11 +625,11 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw t1, 12(a1)
-; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw t0, 8(a2)
 ; RV32I-NEXT:    lw t2, 12(a2)
 ; RV32I-NEXT:    lw a1, 0(a2)
 ; RV32I-NEXT:    lw a2, 4(a2)
-; RV32I-NEXT:    sltu t3, a7, a6
+; RV32I-NEXT:    sltu t3, t0, a6
 ; RV32I-NEXT:    mv t4, t3
 ; RV32I-NEXT:    beq t1, t2, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
@@ -637,19 +637,19 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB11_2:
 ; RV32I-NEXT:    sltu a5, a1, a3
 ; RV32I-NEXT:    sltu t6, a2, a4
-; RV32I-NEXT:    mv t0, a5
+; RV32I-NEXT:    mv a7, a5
 ; RV32I-NEXT:    beq a4, a2, .LBB11_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv t0, t6
+; RV32I-NEXT:    mv a7, t6
 ; RV32I-NEXT:  .LBB11_4:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    xor t5, t1, t2
-; RV32I-NEXT:    xor s0, a6, a7
+; RV32I-NEXT:    xor s0, a6, t0
 ; RV32I-NEXT:    or t5, s0, t5
 ; RV32I-NEXT:    beqz t5, .LBB11_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv t0, t4
+; RV32I-NEXT:    mv a7, t4
 ; RV32I-NEXT:  .LBB11_6:
 ; RV32I-NEXT:    mv t5, a5
 ; RV32I-NEXT:    beq a2, a4, .LBB11_8
@@ -662,27 +662,27 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t6, a4, a2
 ; RV32I-NEXT:  .LBB11_10:
-; RV32I-NEXT:    bnez t0, .LBB11_12
+; RV32I-NEXT:    bnez a7, .LBB11_12
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t1, t2, t1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a7, t1, t3
+; RV32I-NEXT:    sub a6, t0, a6
+; RV32I-NEXT:    sub t0, t1, t3
 ; RV32I-NEXT:    sltu t1, a6, t5
-; RV32I-NEXT:    sub a7, a7, t1
+; RV32I-NEXT:    sub t0, t0, t1
 ; RV32I-NEXT:    sub a6, a6, t5
 ; RV32I-NEXT:    j .LBB11_13
 ; RV32I-NEXT:  .LBB11_12:
-; RV32I-NEXT:    sltu t3, a6, a7
+; RV32I-NEXT:    sltu t3, a6, t0
 ; RV32I-NEXT:    sub t1, t1, t2
-; RV32I-NEXT:    sub t1, t1, t3
-; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a7, t1, a7
+; RV32I-NEXT:    sub a6, a6, t0
+; RV32I-NEXT:    sub t0, t1, t3
+; RV32I-NEXT:    sltu t1, a6, t6
+; RV32I-NEXT:    sub t0, t0, t1
 ; RV32I-NEXT:    sub a6, a6, t6
 ; RV32I-NEXT:  .LBB11_13:
 ; RV32I-NEXT:    snez t1, a6
-; RV32I-NEXT:    add a7, a7, t1
-; RV32I-NEXT:    bnez t0, .LBB11_15
+; RV32I-NEXT:    add t0, t0, t1
+; RV32I-NEXT:    bnez a7, .LBB11_15
 ; RV32I-NEXT:  # %bb.14:
 ; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a2, a2, a5
@@ -694,20 +694,20 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:  .LBB11_16:
 ; RV32I-NEXT:    or a3, a1, a2
-; RV32I-NEXT:    snez a3, a3
 ; RV32I-NEXT:    neg a4, a6
-; RV32I-NEXT:    sltu a5, a4, a3
-; RV32I-NEXT:    neg a6, a7
-; RV32I-NEXT:    sub a5, a6, a5
+; RV32I-NEXT:    neg a5, t0
 ; RV32I-NEXT:    snez a6, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    snez a3, a3
 ; RV32I-NEXT:    add a2, a2, a6
+; RV32I-NEXT:    sltu a6, a4, a3
 ; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a3, a5, a6
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -744,11 +744,11 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a6, 8(a1)
 ; RV32ZBB-NEXT:    lw t1, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 8(a2)
+; RV32ZBB-NEXT:    lw t0, 8(a2)
 ; RV32ZBB-NEXT:    lw t2, 12(a2)
 ; RV32ZBB-NEXT:    lw a1, 0(a2)
 ; RV32ZBB-NEXT:    lw a2, 4(a2)
-; RV32ZBB-NEXT:    sltu t3, a7, a6
+; RV32ZBB-NEXT:    sltu t3, t0, a6
 ; RV32ZBB-NEXT:    mv t4, t3
 ; RV32ZBB-NEXT:    beq t1, t2, .LBB11_2
 ; RV32ZBB-NEXT:  # %bb.1:
@@ -756,19 +756,19 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB11_2:
 ; RV32ZBB-NEXT:    sltu a5, a1, a3
 ; RV32ZBB-NEXT:    sltu t6, a2, a4
-; RV32ZBB-NEXT:    mv t0, a5
+; RV32ZBB-NEXT:    mv a7, a5
 ; RV32ZBB-NEXT:    beq a4, a2, .LBB11_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    mv t0, t6
+; RV32ZBB-NEXT:    mv a7, t6
 ; RV32ZBB-NEXT:  .LBB11_4:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32ZBB-NEXT:    xor t5, t1, t2
-; RV32ZBB-NEXT:    xor s0, a6, a7
+; RV32ZBB-NEXT:    xor s0, a6, t0
 ; RV32ZBB-NEXT:    or t5, s0, t5
 ; RV32ZBB-NEXT:    beqz t5, .LBB11_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    mv t0, t4
+; RV32ZBB-NEXT:    mv a7, t4
 ; RV32ZBB-NEXT:  .LBB11_6:
 ; RV32ZBB-NEXT:    mv t5, a5
 ; RV32ZBB-NEXT:    beq a2, a4, .LBB11_8
@@ -781,27 +781,27 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t6, a4, a2
 ; RV32ZBB-NEXT:  .LBB11_10:
-; RV32ZBB-NEXT:    bnez t0, .LBB11_12
+; RV32ZBB-NEXT:    bnez a7, .LBB11_12
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t1, t2, t1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a7, t1, t3
+; RV32ZBB-NEXT:    sub a6, t0, a6
+; RV32ZBB-NEXT:    sub t0, t1, t3
 ; RV32ZBB-NEXT:    sltu t1, a6, t5
-; RV32ZBB-NEXT:    sub a7, a7, t1
+; RV32ZBB-NEXT:    sub t0, t0, t1
 ; RV32ZBB-NEXT:    sub a6, a6, t5
 ; RV32ZBB-NEXT:    j .LBB11_13
 ; RV32ZBB-NEXT:  .LBB11_12:
-; RV32ZBB-NEXT:    sltu t3, a6, a7
+; RV32ZBB-NEXT:    sltu t3, a6, t0
 ; RV32ZBB-NEXT:    sub t1, t1, t2
-; RV32ZBB-NEXT:    sub t1, t1, t3
-; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a7, t1, a7
+; RV32ZBB-NEXT:    sub a6, a6, t0
+; RV32ZBB-NEXT:    sub t0, t1, t3
+; RV32ZBB-NEXT:    sltu t1, a6, t6
+; RV32ZBB-NEXT:    sub t0, t0, t1
 ; RV32ZBB-NEXT:    sub a6, a6, t6
 ; RV32ZBB-NEXT:  .LBB11_13:
 ; RV32ZBB-NEXT:    snez t1, a6
-; RV32ZBB-NEXT:    add a7, a7, t1
-; RV32ZBB-NEXT:    bnez t0, .LBB11_15
+; RV32ZBB-NEXT:    add t0, t0, t1
+; RV32ZBB-NEXT:    bnez a7, .LBB11_15
 ; RV32ZBB-NEXT:  # %bb.14:
 ; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    sub a2, a2, a5
@@ -813,20 +813,20 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:  .LBB11_16:
 ; RV32ZBB-NEXT:    or a3, a1, a2
-; RV32ZBB-NEXT:    snez a3, a3
 ; RV32ZBB-NEXT:    neg a4, a6
-; RV32ZBB-NEXT:    sltu a5, a4, a3
-; RV32ZBB-NEXT:    neg a6, a7
-; RV32ZBB-NEXT:    sub a5, a6, a5
+; RV32ZBB-NEXT:    neg a5, t0
 ; RV32ZBB-NEXT:    snez a6, a1
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    snez a3, a3
 ; RV32ZBB-NEXT:    add a2, a2, a6
+; RV32ZBB-NEXT:    sltu a6, a4, a3
 ; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:    sub a4, a4, a3
-; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a3, a5, a6
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a5, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -872,11 +872,11 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw t1, 12(a1)
-; RV32I-NEXT:    lw a7, 8(a2)
+; RV32I-NEXT:    lw t0, 8(a2)
 ; RV32I-NEXT:    lw t2, 12(a2)
 ; RV32I-NEXT:    lw a1, 0(a2)
 ; RV32I-NEXT:    lw a2, 4(a2)
-; RV32I-NEXT:    sltu t3, a7, a6
+; RV32I-NEXT:    sltu t3, t0, a6
 ; RV32I-NEXT:    mv t4, t3
 ; RV32I-NEXT:    beq t1, t2, .LBB12_2
 ; RV32I-NEXT:  # %bb.1:
@@ -884,19 +884,19 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB12_2:
 ; RV32I-NEXT:    sltu a5, a1, a3
 ; RV32I-NEXT:    sltu t6, a2, a4
-; RV32I-NEXT:    mv t0, a5
+; RV32I-NEXT:    mv a7, a5
 ; RV32I-NEXT:    beq a4, a2, .LBB12_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    mv t0, t6
+; RV32I-NEXT:    mv a7, t6
 ; RV32I-NEXT:  .LBB12_4:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    xor t5, t1, t2
-; RV32I-NEXT:    xor s0, a6, a7
+; RV32I-NEXT:    xor s0, a6, t0
 ; RV32I-NEXT:    or t5, s0, t5
 ; RV32I-NEXT:    beqz t5, .LBB12_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    mv t0, t4
+; RV32I-NEXT:    mv a7, t4
 ; RV32I-NEXT:  .LBB12_6:
 ; RV32I-NEXT:    mv t5, a5
 ; RV32I-NEXT:    beq a2, a4, .LBB12_8
@@ -909,27 +909,27 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t6, a4, a2
 ; RV32I-NEXT:  .LBB12_10:
-; RV32I-NEXT:    bnez t0, .LBB12_12
+; RV32I-NEXT:    bnez a7, .LBB12_12
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t1, t2, t1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a7, t1, t3
+; RV32I-NEXT:    sub a6, t0, a6
+; RV32I-NEXT:    sub t0, t1, t3
 ; RV32I-NEXT:    sltu t1, a6, t5
-; RV32I-NEXT:    sub a7, a7, t1
+; RV32I-NEXT:    sub t0, t0, t1
 ; RV32I-NEXT:    sub a6, a6, t5
 ; RV32I-NEXT:    j .LBB12_13
 ; RV32I-NEXT:  .LBB12_12:
-; RV32I-NEXT:    sltu t3, a6, a7
+; RV32I-NEXT:    sltu t3, a6, t0
 ; RV32I-NEXT:    sub t1, t1, t2
-; RV32I-NEXT:    sub t1, t1, t3
-; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a7, t1, a7
+; RV32I-NEXT:    sub a6, a6, t0
+; RV32I-NEXT:    sub t0, t1, t3
+; RV32I-NEXT:    sltu t1, a6, t6
+; RV32I-NEXT:    sub t0, t0, t1
 ; RV32I-NEXT:    sub a6, a6, t6
 ; RV32I-NEXT:  .LBB12_13:
 ; RV32I-NEXT:    snez t1, a6
-; RV32I-NEXT:    add a7, a7, t1
-; RV32I-NEXT:    bnez t0, .LBB12_15
+; RV32I-NEXT:    add t0, t0, t1
+; RV32I-NEXT:    bnez a7, .LBB12_15
 ; RV32I-NEXT:  # %bb.14:
 ; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a2, a2, a5
@@ -941,20 +941,20 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:  .LBB12_16:
 ; RV32I-NEXT:    or a3, a1, a2
-; RV32I-NEXT:    snez a3, a3
 ; RV32I-NEXT:    neg a4, a6
-; RV32I-NEXT:    sltu a5, a4, a3
-; RV32I-NEXT:    neg a6, a7
-; RV32I-NEXT:    sub a5, a6, a5
+; RV32I-NEXT:    neg a5, t0
 ; RV32I-NEXT:    snez a6, a1
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    snez a3, a3
 ; RV32I-NEXT:    add a2, a2, a6
+; RV32I-NEXT:    sltu a6, a4, a3
 ; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a3, a5, a6
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -991,11 +991,11 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a6, 8(a1)
 ; RV32ZBB-NEXT:    lw t1, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 8(a2)
+; RV32ZBB-NEXT:    lw t0, 8(a2)
 ; RV32ZBB-NEXT:    lw t2, 12(a2)
 ; RV32ZBB-NEXT:    lw a1, 0(a2)
 ; RV32ZBB-NEXT:    lw a2, 4(a2)
-; RV32ZBB-NEXT:    sltu t3, a7, a6
+; RV32ZBB-NEXT:    sltu t3, t0, a6
 ; RV32ZBB-NEXT:    mv t4, t3
 ; RV32ZBB-NEXT:    beq t1, t2, .LBB12_2
 ; RV32ZBB-NEXT:  # %bb.1:
@@ -1003,19 +1003,19 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB12_2:
 ; RV32ZBB-NEXT:    sltu a5, a1, a3
 ; RV32ZBB-NEXT:    sltu t6, a2, a4
-; RV32ZBB-NEXT:    mv t0, a5
+; RV32ZBB-NEXT:    mv a7, a5
 ; RV32ZBB-NEXT:    beq a4, a2, .LBB12_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    mv t0, t6
+; RV32ZBB-NEXT:    mv a7, t6
 ; RV32ZBB-NEXT:  .LBB12_4:
 ; RV32ZBB-NEXT:    addi sp, sp, -16
 ; RV32ZBB-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32ZBB-NEXT:    xor t5, t1, t2
-; RV32ZBB-NEXT:    xor s0, a6, a7
+; RV32ZBB-NEXT:    xor s0, a6, t0
 ; RV32ZBB-NEXT:    or t5, s0, t5
 ; RV32ZBB-NEXT:    beqz t5, .LBB12_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    mv t0, t4
+; RV32ZBB-NEXT:    mv a7, t4
 ; RV32ZBB-NEXT:  .LBB12_6:
 ; RV32ZBB-NEXT:    mv t5, a5
 ; RV32ZBB-NEXT:    beq a2, a4, .LBB12_8
@@ -1028,27 +1028,27 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t6, a4, a2
 ; RV32ZBB-NEXT:  .LBB12_10:
-; RV32ZBB-NEXT:    bnez t0, .LBB12_12
+; RV32ZBB-NEXT:    bnez a7, .LBB12_12
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t1, t2, t1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a7, t1, t3
+; RV32ZBB-NEXT:    sub a6, t0, a6
+; RV32ZBB-NEXT:    sub t0, t1, t3
 ; RV32ZBB-NEXT:    sltu t1, a6, t5
-; RV32ZBB-NEXT:    sub a7, a7, t1
+; RV32ZBB-NEXT:    sub t0, t0, t1
 ; RV32ZBB-NEXT:    sub a6, a6, t5
 ; RV32ZBB-NEXT:    j .LBB12_13
 ; RV32ZBB-NEXT:  .LBB12_12:
-; RV32ZBB-NEXT:    sltu t3, a6, a7
+; RV32ZBB-NEXT:    sltu t3, a6, t0
 ; RV32ZBB-NEXT:    sub t1, t1, t2
-; RV32ZBB-NEXT:    sub t1, t1, t3
-; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a7, t1, a7
+; RV32ZBB-NEXT:    sub a6, a6, t0
+; RV32ZBB-NEXT:    sub t0, t1, t3
+; RV32ZBB-NEXT:    sltu t1, a6, t6
+; RV32ZBB-NEXT:    sub t0, t0, t1
 ; RV32ZBB-NEXT:    sub a6, a6, t6
 ; RV32ZBB-NEXT:  .LBB12_13:
 ; RV32ZBB-NEXT:    snez t1, a6
-; RV32ZBB-NEXT:    add a7, a7, t1
-; RV32ZBB-NEXT:    bnez t0, .LBB12_15
+; RV32ZBB-NEXT:    add t0, t0, t1
+; RV32ZBB-NEXT:    bnez a7, .LBB12_15
 ; RV32ZBB-NEXT:  # %bb.14:
 ; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    sub a2, a2, a5
@@ -1060,20 +1060,20 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:  .LBB12_16:
 ; RV32ZBB-NEXT:    or a3, a1, a2
-; RV32ZBB-NEXT:    snez a3, a3
 ; RV32ZBB-NEXT:    neg a4, a6
-; RV32ZBB-NEXT:    sltu a5, a4, a3
-; RV32ZBB-NEXT:    neg a6, a7
-; RV32ZBB-NEXT:    sub a5, a6, a5
+; RV32ZBB-NEXT:    neg a5, t0
 ; RV32ZBB-NEXT:    snez a6, a1
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    snez a3, a3
 ; RV32ZBB-NEXT:    add a2, a2, a6
+; RV32ZBB-NEXT:    sltu a6, a4, a3
 ; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:    sub a4, a4, a3
-; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a3, a5, a6
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a5, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -1120,8 +1120,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    bge a0, a1, .LBB13_3
@@ -1140,8 +1140,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    mv a2, a0
 ; RV64I-NEXT:    bge a0, a1, .LBB13_3
@@ -1175,8 +1175,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    mv a2, a0
 ; RV32I-NEXT:    bge a0, a1, .LBB14_3
@@ -1195,8 +1195,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    mv a2, a0
 ; RV64I-NEXT:    bge a0, a1, .LBB14_3
@@ -1449,26 +1449,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    mv a5, t0
 ; RV32I-NEXT:    mv a4, a7
 ; RV32I-NEXT:  .LBB17_19:
-; RV32I-NEXT:    sltu a6, t3, a4
-; RV32I-NEXT:    sub a7, t4, a5
-; RV32I-NEXT:    sltu a5, a2, a1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    mv a7, a5
+; RV32I-NEXT:    sltu a7, t3, a4
+; RV32I-NEXT:    sub a5, t4, a5
+; RV32I-NEXT:    sltu a6, a2, a1
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beq t1, a3, .LBB17_21
 ; RV32I-NEXT:  # %bb.20:
 ; RV32I-NEXT:    sltu a7, t1, a3
 ; RV32I-NEXT:  .LBB17_21:
 ; RV32I-NEXT:    sub a4, t3, a4
-; RV32I-NEXT:    sltu t0, a4, a7
-; RV32I-NEXT:    sub a6, a6, t0
-; RV32I-NEXT:    sub a4, a4, a7
 ; RV32I-NEXT:    sub a3, t1, a3
-; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a2, a2, a1
+; RV32I-NEXT:    sltu a1, a4, a7
+; RV32I-NEXT:    sub a4, a4, a7
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a5, a5, a1
 ; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a6, 12(a0)
+; RV32I-NEXT:    sw a5, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1576,26 +1576,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    mv a5, t0
 ; RV32ZBB-NEXT:    mv a4, a7
 ; RV32ZBB-NEXT:  .LBB17_19:
-; RV32ZBB-NEXT:    sltu a6, t3, a4
-; RV32ZBB-NEXT:    sub a7, t4, a5
-; RV32ZBB-NEXT:    sltu a5, a2, a1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    mv a7, a5
+; RV32ZBB-NEXT:    sltu a7, t3, a4
+; RV32ZBB-NEXT:    sub a5, t4, a5
+; RV32ZBB-NEXT:    sltu a6, a2, a1
+; RV32ZBB-NEXT:    sub a5, a5, a7
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beq t1, a3, .LBB17_21
 ; RV32ZBB-NEXT:  # %bb.20:
 ; RV32ZBB-NEXT:    sltu a7, t1, a3
 ; RV32ZBB-NEXT:  .LBB17_21:
 ; RV32ZBB-NEXT:    sub a4, t3, a4
-; RV32ZBB-NEXT:    sltu t0, a4, a7
-; RV32ZBB-NEXT:    sub a6, a6, t0
-; RV32ZBB-NEXT:    sub a4, a4, a7
 ; RV32ZBB-NEXT:    sub a3, t1, a3
-; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a2, a2, a1
+; RV32ZBB-NEXT:    sltu a1, a4, a7
+; RV32ZBB-NEXT:    sub a4, a4, a7
+; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a5, a5, a1
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a6, 12(a0)
+; RV32ZBB-NEXT:    sw a5, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -1647,8 +1647,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_cmp_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a0, 24
-; RV32I-NEXT:    srai a2, a2, 24
 ; RV32I-NEXT:    slli a3, a1, 24
+; RV32I-NEXT:    srai a2, a2, 24
 ; RV32I-NEXT:    srai a3, a3, 24
 ; RV32I-NEXT:    bge a3, a2, .LBB18_2
 ; RV32I-NEXT:  # %bb.1:
@@ -1661,8 +1661,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 56
-; RV64I-NEXT:    srai a2, a2, 56
 ; RV64I-NEXT:    slli a3, a1, 56
+; RV64I-NEXT:    srai a2, a2, 56
 ; RV64I-NEXT:    srai a3, a3, 56
 ; RV64I-NEXT:    bge a3, a2, .LBB18_2
 ; RV64I-NEXT:  # %bb.1:
@@ -1694,8 +1694,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_cmp_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    srai a2, a2, 16
 ; RV32I-NEXT:    slli a3, a0, 16
+; RV32I-NEXT:    srai a2, a2, 16
 ; RV32I-NEXT:    srai a3, a3, 16
 ; RV32I-NEXT:    blt a3, a2, .LBB19_2
 ; RV32I-NEXT:  # %bb.1:
@@ -1708,8 +1708,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a1, 48
-; RV64I-NEXT:    srai a2, a2, 48
 ; RV64I-NEXT:    slli a3, a0, 48
+; RV64I-NEXT:    srai a2, a2, 48
 ; RV64I-NEXT:    srai a3, a3, 48
 ; RV64I-NEXT:    blt a3, a2, .LBB19_2
 ; RV64I-NEXT:  # %bb.1:
@@ -1898,30 +1898,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t1, a5, a6
 ; RV32I-NEXT:    sub a7, a7, t0
-; RV32I-NEXT:    sub a7, a7, t1
-; RV32I-NEXT:    sub a6, a5, a6
-; RV32I-NEXT:    sltu a5, a6, t5
-; RV32I-NEXT:    sub a5, a7, a5
-; RV32I-NEXT:    sub a6, a6, t5
+; RV32I-NEXT:    sub a5, a5, a6
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t4
+; RV32I-NEXT:    sub a6, a7, t1
+; RV32I-NEXT:    sltu a7, a5, t5
+; RV32I-NEXT:    sub a1, a5, t5
+; RV32I-NEXT:    sub a5, a4, t4
+; RV32I-NEXT:    sub a4, a6, a7
 ; RV32I-NEXT:    sub a2, a3, a2
 ; RV32I-NEXT:    j .LBB22_11
 ; RV32I-NEXT:  .LBB22_10:
 ; RV32I-NEXT:    sub a7, t0, a7
-; RV32I-NEXT:    sub a6, a6, a5
-; RV32I-NEXT:    sub a5, a7, t1
-; RV32I-NEXT:    sltu a7, a6, t3
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a5, a5, a7
-; RV32I-NEXT:    sub a6, a6, t3
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a5, a6, a5
+; RV32I-NEXT:    sub a4, a1, a4
+; RV32I-NEXT:    sub a6, a7, t1
+; RV32I-NEXT:    sltu a7, a5, t3
+; RV32I-NEXT:    sub a1, a5, t3
+; RV32I-NEXT:    sub a5, a4, t2
+; RV32I-NEXT:    sub a4, a6, a7
 ; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:  .LBB22_11:
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
-; RV32I-NEXT:    sw a6, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a5, 4(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_cmp_i128:
@@ -1985,30 +1985,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t1, a5, a6
 ; RV32ZBB-NEXT:    sub a7, a7, t0
-; RV32ZBB-NEXT:    sub a7, a7, t1
-; RV32ZBB-NEXT:    sub a6, a5, a6
-; RV32ZBB-NEXT:    sltu a5, a6, t5
-; RV32ZBB-NEXT:    sub a5, a7, a5
-; RV32ZBB-NEXT:    sub a6, a6, t5
+; RV32ZBB-NEXT:    sub a5, a5, a6
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t4
+; RV32ZBB-NEXT:    sub a6, a7, t1
+; RV32ZBB-NEXT:    sltu a7, a5, t5
+; RV32ZBB-NEXT:    sub a1, a5, t5
+; RV32ZBB-NEXT:    sub a5, a4, t4
+; RV32ZBB-NEXT:    sub a4, a6, a7
 ; RV32ZBB-NEXT:    sub a2, a3, a2
 ; RV32ZBB-NEXT:    j .LBB22_11
 ; RV32ZBB-NEXT:  .LBB22_10:
 ; RV32ZBB-NEXT:    sub a7, t0, a7
-; RV32ZBB-NEXT:    sub a6, a6, a5
-; RV32ZBB-NEXT:    sub a5, a7, t1
-; RV32ZBB-NEXT:    sltu a7, a6, t3
-; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a5, a5, a7
-; RV32ZBB-NEXT:    sub a6, a6, t3
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a5, a6, a5
+; RV32ZBB-NEXT:    sub a4, a1, a4
+; RV32ZBB-NEXT:    sub a6, a7, t1
+; RV32ZBB-NEXT:    sltu a7, a5, t3
+; RV32ZBB-NEXT:    sub a1, a5, t3
+; RV32ZBB-NEXT:    sub a5, a4, t2
+; RV32ZBB-NEXT:    sub a4, a6, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:  .LBB22_11:
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
-; RV32ZBB-NEXT:    sw a6, 8(a0)
-; RV32ZBB-NEXT:    sw a5, 12(a0)
+; RV32ZBB-NEXT:    sw a5, 4(a0)
+; RV32ZBB-NEXT:    sw a1, 8(a0)
+; RV32ZBB-NEXT:    sw a4, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_cmp_i128:
@@ -2289,12 +2289,12 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
+; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
@@ -2312,12 +2312,12 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    sltu a4, a0, a2
 ; RV32ZBB-NEXT:    sub a1, a1, a3
-; RV32ZBB-NEXT:    sub a1, a1, a4
 ; RV32ZBB-NEXT:    sub a0, a0, a2
+; RV32ZBB-NEXT:    sub a1, a1, a4
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    xor a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0
@@ -2340,12 +2340,12 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
+; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
@@ -2363,12 +2363,12 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    sltu a4, a0, a2
 ; RV32ZBB-NEXT:    sub a1, a1, a3
-; RV32ZBB-NEXT:    sub a1, a1, a4
 ; RV32ZBB-NEXT:    sub a0, a0, a2
+; RV32ZBB-NEXT:    sub a1, a1, a4
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    xor a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0
@@ -2392,64 +2392,64 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a3, 0(a2)
 ; RV32I-NEXT:    lw a4, 4(a2)
 ; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a7, 12(a2)
-; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a6, 12(a2)
+; RV32I-NEXT:    lw t0, 8(a1)
+; RV32I-NEXT:    lw t1, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a6, a5
-; RV32I-NEXT:    sub t0, t0, a7
-; RV32I-NEXT:    sltu a7, a2, a3
-; RV32I-NEXT:    sub t1, t0, t1
-; RV32I-NEXT:    mv t0, a7
-; RV32I-NEXT:    beq a1, a4, .LBB31_2
+; RV32I-NEXT:    lw a7, 4(a1)
+; RV32I-NEXT:    sltu a1, t0, a5
+; RV32I-NEXT:    sub t1, t1, a6
+; RV32I-NEXT:    sltu a6, a2, a3
+; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    beq a7, a4, .LBB31_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t0, a1, a4
+; RV32I-NEXT:    sltu t1, a7, a4
 ; RV32I-NEXT:  .LBB31_2:
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sltu a6, a5, t0
-; RV32I-NEXT:    sub a6, t1, a6
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub t1, a1, a7
-; RV32I-NEXT:    sub a4, a5, t0
+; RV32I-NEXT:    sub a5, t0, a5
+; RV32I-NEXT:    sub a4, a7, a4
 ; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    srai a1, a6, 31
+; RV32I-NEXT:    sltu a2, a5, t1
+; RV32I-NEXT:    sub t0, a4, a6
+; RV32I-NEXT:    sub a4, a5, t1
+; RV32I-NEXT:    sub a5, a1, a2
+; RV32I-NEXT:    srai a1, a5, 31
 ; RV32I-NEXT:    xor a2, a4, a1
-; RV32I-NEXT:    sltu a4, a1, a2
-; RV32I-NEXT:    xor a5, a6, a1
-; RV32I-NEXT:    sub a5, a1, a5
-; RV32I-NEXT:    sub a4, a5, a4
-; RV32I-NEXT:    xor a3, a3, a1
-; RV32I-NEXT:    sltu a5, a1, a3
-; RV32I-NEXT:    xor a6, t1, a1
-; RV32I-NEXT:    mv a7, a5
-; RV32I-NEXT:    beqz t1, .LBB31_4
+; RV32I-NEXT:    xor a5, a5, a1
+; RV32I-NEXT:    xor a4, a3, a1
+; RV32I-NEXT:    sltu a3, a1, a2
+; RV32I-NEXT:    sub a6, a1, a5
+; RV32I-NEXT:    sltu a5, a1, a4
+; RV32I-NEXT:    sub a3, a6, a3
+; RV32I-NEXT:    xor a7, t0, a1
+; RV32I-NEXT:    mv a6, a5
+; RV32I-NEXT:    beqz t0, .LBB31_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu a7, a1, a6
+; RV32I-NEXT:    sltu a6, a1, a7
 ; RV32I-NEXT:  .LBB31_4:
 ; RV32I-NEXT:    sub a2, a1, a2
-; RV32I-NEXT:    sltu t0, a2, a7
-; RV32I-NEXT:    sub a4, a4, t0
-; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a6, a1, a6
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a7, a1, a7
+; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    sltu a4, a2, a6
+; RV32I-NEXT:    sub a2, a2, a6
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_subnsw_i128:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a4, a0, a2
 ; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    sub a1, a1, a4
 ; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    sub a1, a1, a4
 ; RV64I-NEXT:    srai a2, a1, 63
 ; RV64I-NEXT:    xor a0, a0, a2
-; RV64I-NEXT:    sltu a3, a2, a0
 ; RV64I-NEXT:    xor a1, a1, a2
+; RV64I-NEXT:    sltu a3, a2, a0
 ; RV64I-NEXT:    sub a1, a2, a1
 ; RV64I-NEXT:    sub a1, a1, a3
 ; RV64I-NEXT:    sub a0, a2, a0
@@ -2460,64 +2460,64 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
 ; RV32ZBB-NEXT:    lw a4, 4(a2)
 ; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a7, 12(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a1)
-; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a2)
+; RV32ZBB-NEXT:    lw t0, 8(a1)
+; RV32ZBB-NEXT:    lw t1, 12(a1)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a6, a5
-; RV32ZBB-NEXT:    sub t0, t0, a7
-; RV32ZBB-NEXT:    sltu a7, a2, a3
-; RV32ZBB-NEXT:    sub t1, t0, t1
-; RV32ZBB-NEXT:    mv t0, a7
-; RV32ZBB-NEXT:    beq a1, a4, .LBB31_2
+; RV32ZBB-NEXT:    lw a7, 4(a1)
+; RV32ZBB-NEXT:    sltu a1, t0, a5
+; RV32ZBB-NEXT:    sub t1, t1, a6
+; RV32ZBB-NEXT:    sltu a6, a2, a3
+; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    beq a7, a4, .LBB31_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t0, a1, a4
+; RV32ZBB-NEXT:    sltu t1, a7, a4
 ; RV32ZBB-NEXT:  .LBB31_2:
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sltu a6, a5, t0
-; RV32ZBB-NEXT:    sub a6, t1, a6
-; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub t1, a1, a7
-; RV32ZBB-NEXT:    sub a4, a5, t0
+; RV32ZBB-NEXT:    sub a5, t0, a5
+; RV32ZBB-NEXT:    sub a4, a7, a4
 ; RV32ZBB-NEXT:    sub a3, a2, a3
-; RV32ZBB-NEXT:    srai a1, a6, 31
+; RV32ZBB-NEXT:    sltu a2, a5, t1
+; RV32ZBB-NEXT:    sub t0, a4, a6
+; RV32ZBB-NEXT:    sub a4, a5, t1
+; RV32ZBB-NEXT:    sub a5, a1, a2
+; RV32ZBB-NEXT:    srai a1, a5, 31
 ; RV32ZBB-NEXT:    xor a2, a4, a1
-; RV32ZBB-NEXT:    sltu a4, a1, a2
-; RV32ZBB-NEXT:    xor a5, a6, a1
-; RV32ZBB-NEXT:    sub a5, a1, a5
-; RV32ZBB-NEXT:    sub a4, a5, a4
-; RV32ZBB-NEXT:    xor a3, a3, a1
-; RV32ZBB-NEXT:    sltu a5, a1, a3
-; RV32ZBB-NEXT:    xor a6, t1, a1
-; RV32ZBB-NEXT:    mv a7, a5
-; RV32ZBB-NEXT:    beqz t1, .LBB31_4
+; RV32ZBB-NEXT:    xor a5, a5, a1
+; RV32ZBB-NEXT:    xor a4, a3, a1
+; RV32ZBB-NEXT:    sltu a3, a1, a2
+; RV32ZBB-NEXT:    sub a6, a1, a5
+; RV32ZBB-NEXT:    sltu a5, a1, a4
+; RV32ZBB-NEXT:    sub a3, a6, a3
+; RV32ZBB-NEXT:    xor a7, t0, a1
+; RV32ZBB-NEXT:    mv a6, a5
+; RV32ZBB-NEXT:    beqz t0, .LBB31_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu a7, a1, a6
+; RV32ZBB-NEXT:    sltu a6, a1, a7
 ; RV32ZBB-NEXT:  .LBB31_4:
 ; RV32ZBB-NEXT:    sub a2, a1, a2
-; RV32ZBB-NEXT:    sltu t0, a2, a7
-; RV32ZBB-NEXT:    sub a4, a4, t0
-; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a6, a1, a6
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    sub a7, a1, a7
+; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    sltu a4, a2, a6
+; RV32ZBB-NEXT:    sub a2, a2, a6
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_subnsw_i128:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    sltu a4, a0, a2
 ; RV64ZBB-NEXT:    sub a1, a1, a3
-; RV64ZBB-NEXT:    sub a1, a1, a4
 ; RV64ZBB-NEXT:    sub a0, a0, a2
+; RV64ZBB-NEXT:    sub a1, a1, a4
 ; RV64ZBB-NEXT:    srai a2, a1, 63
 ; RV64ZBB-NEXT:    xor a0, a0, a2
-; RV64ZBB-NEXT:    sltu a3, a2, a0
 ; RV64ZBB-NEXT:    xor a1, a1, a2
+; RV64ZBB-NEXT:    sltu a3, a2, a0
 ; RV64ZBB-NEXT:    sub a1, a2, a1
 ; RV64ZBB-NEXT:    sub a1, a1, a3
 ; RV64ZBB-NEXT:    sub a0, a2, a0
@@ -2534,64 +2534,64 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a3, 0(a2)
 ; RV32I-NEXT:    lw a4, 4(a2)
 ; RV32I-NEXT:    lw a5, 8(a2)
-; RV32I-NEXT:    lw a7, 12(a2)
-; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a6, 12(a2)
+; RV32I-NEXT:    lw t0, 8(a1)
+; RV32I-NEXT:    lw t1, 12(a1)
 ; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a1, 4(a1)
-; RV32I-NEXT:    sltu t1, a6, a5
-; RV32I-NEXT:    sub t0, t0, a7
-; RV32I-NEXT:    sltu a7, a2, a3
-; RV32I-NEXT:    sub t1, t0, t1
-; RV32I-NEXT:    mv t0, a7
-; RV32I-NEXT:    beq a1, a4, .LBB32_2
+; RV32I-NEXT:    lw a7, 4(a1)
+; RV32I-NEXT:    sltu a1, t0, a5
+; RV32I-NEXT:    sub t1, t1, a6
+; RV32I-NEXT:    sltu a6, a2, a3
+; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    beq a7, a4, .LBB32_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t0, a1, a4
+; RV32I-NEXT:    sltu t1, a7, a4
 ; RV32I-NEXT:  .LBB32_2:
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sltu a6, a5, t0
-; RV32I-NEXT:    sub a6, t1, a6
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub t1, a1, a7
-; RV32I-NEXT:    sub a4, a5, t0
+; RV32I-NEXT:    sub a5, t0, a5
+; RV32I-NEXT:    sub a4, a7, a4
 ; RV32I-NEXT:    sub a3, a2, a3
-; RV32I-NEXT:    srai a1, a6, 31
+; RV32I-NEXT:    sltu a2, a5, t1
+; RV32I-NEXT:    sub t0, a4, a6
+; RV32I-NEXT:    sub a4, a5, t1
+; RV32I-NEXT:    sub a5, a1, a2
+; RV32I-NEXT:    srai a1, a5, 31
 ; RV32I-NEXT:    xor a2, a4, a1
-; RV32I-NEXT:    sltu a4, a1, a2
-; RV32I-NEXT:    xor a5, a6, a1
-; RV32I-NEXT:    sub a5, a1, a5
-; RV32I-NEXT:    sub a4, a5, a4
-; RV32I-NEXT:    xor a3, a3, a1
-; RV32I-NEXT:    sltu a5, a1, a3
-; RV32I-NEXT:    xor a6, t1, a1
-; RV32I-NEXT:    mv a7, a5
-; RV32I-NEXT:    beqz t1, .LBB32_4
+; RV32I-NEXT:    xor a5, a5, a1
+; RV32I-NEXT:    xor a4, a3, a1
+; RV32I-NEXT:    sltu a3, a1, a2
+; RV32I-NEXT:    sub a6, a1, a5
+; RV32I-NEXT:    sltu a5, a1, a4
+; RV32I-NEXT:    sub a3, a6, a3
+; RV32I-NEXT:    xor a7, t0, a1
+; RV32I-NEXT:    mv a6, a5
+; RV32I-NEXT:    beqz t0, .LBB32_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu a7, a1, a6
+; RV32I-NEXT:    sltu a6, a1, a7
 ; RV32I-NEXT:  .LBB32_4:
 ; RV32I-NEXT:    sub a2, a1, a2
-; RV32I-NEXT:    sltu t0, a2, a7
-; RV32I-NEXT:    sub a4, a4, t0
-; RV32I-NEXT:    sub a2, a2, a7
-; RV32I-NEXT:    sub a6, a1, a6
-; RV32I-NEXT:    sub a5, a6, a5
-; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a7, a1, a7
+; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    sltu a4, a2, a6
+; RV32I-NEXT:    sub a2, a2, a6
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_subnsw_i128_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sltu a4, a0, a2
 ; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    sub a1, a1, a4
 ; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    sub a1, a1, a4
 ; RV64I-NEXT:    srai a2, a1, 63
 ; RV64I-NEXT:    xor a0, a0, a2
-; RV64I-NEXT:    sltu a3, a2, a0
 ; RV64I-NEXT:    xor a1, a1, a2
+; RV64I-NEXT:    sltu a3, a2, a0
 ; RV64I-NEXT:    sub a1, a2, a1
 ; RV64I-NEXT:    sub a1, a1, a3
 ; RV64I-NEXT:    sub a0, a2, a0
@@ -2602,64 +2602,64 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    lw a3, 0(a2)
 ; RV32ZBB-NEXT:    lw a4, 4(a2)
 ; RV32ZBB-NEXT:    lw a5, 8(a2)
-; RV32ZBB-NEXT:    lw a7, 12(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a1)
-; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a2)
+; RV32ZBB-NEXT:    lw t0, 8(a1)
+; RV32ZBB-NEXT:    lw t1, 12(a1)
 ; RV32ZBB-NEXT:    lw a2, 0(a1)
-; RV32ZBB-NEXT:    lw a1, 4(a1)
-; RV32ZBB-NEXT:    sltu t1, a6, a5
-; RV32ZBB-NEXT:    sub t0, t0, a7
-; RV32ZBB-NEXT:    sltu a7, a2, a3
-; RV32ZBB-NEXT:    sub t1, t0, t1
-; RV32ZBB-NEXT:    mv t0, a7
-; RV32ZBB-NEXT:    beq a1, a4, .LBB32_2
+; RV32ZBB-NEXT:    lw a7, 4(a1)
+; RV32ZBB-NEXT:    sltu a1, t0, a5
+; RV32ZBB-NEXT:    sub t1, t1, a6
+; RV32ZBB-NEXT:    sltu a6, a2, a3
+; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    beq a7, a4, .LBB32_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t0, a1, a4
+; RV32ZBB-NEXT:    sltu t1, a7, a4
 ; RV32ZBB-NEXT:  .LBB32_2:
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sltu a6, a5, t0
-; RV32ZBB-NEXT:    sub a6, t1, a6
-; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub t1, a1, a7
-; RV32ZBB-NEXT:    sub a4, a5, t0
+; RV32ZBB-NEXT:    sub a5, t0, a5
+; RV32ZBB-NEXT:    sub a4, a7, a4
 ; RV32ZBB-NEXT:    sub a3, a2, a3
-; RV32ZBB-NEXT:    srai a1, a6, 31
+; RV32ZBB-NEXT:    sltu a2, a5, t1
+; RV32ZBB-NEXT:    sub t0, a4, a6
+; RV32ZBB-NEXT:    sub a4, a5, t1
+; RV32ZBB-NEXT:    sub a5, a1, a2
+; RV32ZBB-NEXT:    srai a1, a5, 31
 ; RV32ZBB-NEXT:    xor a2, a4, a1
-; RV32ZBB-NEXT:    sltu a4, a1, a2
-; RV32ZBB-NEXT:    xor a5, a6, a1
-; RV32ZBB-NEXT:    sub a5, a1, a5
-; RV32ZBB-NEXT:    sub a4, a5, a4
-; RV32ZBB-NEXT:    xor a3, a3, a1
-; RV32ZBB-NEXT:    sltu a5, a1, a3
-; RV32ZBB-NEXT:    xor a6, t1, a1
-; RV32ZBB-NEXT:    mv a7, a5
-; RV32ZBB-NEXT:    beqz t1, .LBB32_4
+; RV32ZBB-NEXT:    xor a5, a5, a1
+; RV32ZBB-NEXT:    xor a4, a3, a1
+; RV32ZBB-NEXT:    sltu a3, a1, a2
+; RV32ZBB-NEXT:    sub a6, a1, a5
+; RV32ZBB-NEXT:    sltu a5, a1, a4
+; RV32ZBB-NEXT:    sub a3, a6, a3
+; RV32ZBB-NEXT:    xor a7, t0, a1
+; RV32ZBB-NEXT:    mv a6, a5
+; RV32ZBB-NEXT:    beqz t0, .LBB32_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu a7, a1, a6
+; RV32ZBB-NEXT:    sltu a6, a1, a7
 ; RV32ZBB-NEXT:  .LBB32_4:
 ; RV32ZBB-NEXT:    sub a2, a1, a2
-; RV32ZBB-NEXT:    sltu t0, a2, a7
-; RV32ZBB-NEXT:    sub a4, a4, t0
-; RV32ZBB-NEXT:    sub a2, a2, a7
-; RV32ZBB-NEXT:    sub a6, a1, a6
-; RV32ZBB-NEXT:    sub a5, a6, a5
-; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    sub a7, a1, a7
+; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    sltu a4, a2, a6
+; RV32ZBB-NEXT:    sub a2, a2, a6
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_subnsw_i128_undef:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    sltu a4, a0, a2
 ; RV64ZBB-NEXT:    sub a1, a1, a3
-; RV64ZBB-NEXT:    sub a1, a1, a4
 ; RV64ZBB-NEXT:    sub a0, a0, a2
+; RV64ZBB-NEXT:    sub a1, a1, a4
 ; RV64ZBB-NEXT:    srai a2, a1, 63
 ; RV64ZBB-NEXT:    xor a0, a0, a2
-; RV64ZBB-NEXT:    sltu a3, a2, a0
 ; RV64ZBB-NEXT:    xor a1, a1, a2
+; RV64ZBB-NEXT:    sltu a3, a2, a0
 ; RV64ZBB-NEXT:    sub a1, a2, a1
 ; RV64ZBB-NEXT:    sub a1, a1, a3
 ; RV64ZBB-NEXT:    sub a0, a2, a0

diff  --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll
index 8208eafbc205cc..56e6dacff97486 100644
--- a/llvm/test/CodeGen/RISCV/abds.ll
+++ b/llvm/test/CodeGen/RISCV/abds.ll
@@ -12,8 +12,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -24,8 +24,8 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -53,8 +53,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -65,8 +65,8 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -94,8 +94,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i8_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -106,8 +106,8 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i8_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -135,8 +135,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -147,8 +147,8 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -224,8 +224,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i16_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -236,8 +236,8 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -579,30 +579,30 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB11_13
 ; RV32I-NEXT:  .LBB11_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB11_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -675,30 +675,30 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB11_13
 ; RV32ZBB-NEXT:  .LBB11_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB11_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -779,30 +779,30 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB12_13
 ; RV32I-NEXT:  .LBB12_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB12_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -875,30 +875,30 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB12_13
 ; RV32ZBB-NEXT:  .LBB12_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB12_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -939,8 +939,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -951,8 +951,8 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -978,8 +978,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_minmax_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -990,8 +990,8 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1168,30 +1168,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB17_13
 ; RV32I-NEXT:  .LBB17_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB17_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1264,30 +1264,30 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB17_13
 ; RV32ZBB-NEXT:  .LBB17_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB17_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -1326,8 +1326,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_cmp_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -1338,8 +1338,8 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1366,8 +1366,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_cmp_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -1378,8 +1378,8 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1559,30 +1559,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB22_13
 ; RV32I-NEXT:  .LBB22_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB22_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1655,30 +1655,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB22_13
 ; RV32ZBB-NEXT:  .LBB22_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB22_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -2045,47 +2045,47 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a4, 0(a2)
 ; RV32I-NEXT:    lw a3, 4(a2)
-; RV32I-NEXT:    lw a6, 8(a2)
-; RV32I-NEXT:    lw t0, 12(a2)
-; RV32I-NEXT:    lw a2, 8(a1)
-; RV32I-NEXT:    lw t1, 12(a1)
-; RV32I-NEXT:    lw a5, 0(a1)
-; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    sltu a1, a2, a6
-; RV32I-NEXT:    sub t1, t1, t0
-; RV32I-NEXT:    sltu t0, a5, a4
-; RV32I-NEXT:    sub a1, t1, a1
-; RV32I-NEXT:    mv t1, t0
-; RV32I-NEXT:    beq a7, a3, .LBB31_2
+; RV32I-NEXT:    lw a5, 8(a2)
+; RV32I-NEXT:    lw a6, 12(a2)
+; RV32I-NEXT:    lw a7, 8(a1)
+; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    sltu t1, a7, a5
+; RV32I-NEXT:    sub t0, t0, a6
+; RV32I-NEXT:    sltu a6, a2, a4
+; RV32I-NEXT:    sub t0, t0, t1
+; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    beq a1, a3, .LBB31_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a7, a3
+; RV32I-NEXT:    sltu t1, a1, a3
 ; RV32I-NEXT:  .LBB31_2:
-; RV32I-NEXT:    sub a2, a2, a6
-; RV32I-NEXT:    sltu a6, a2, t1
-; RV32I-NEXT:    sub a1, a1, a6
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    sub a3, a7, a3
-; RV32I-NEXT:    sub a3, a3, t0
-; RV32I-NEXT:    sub a4, a5, a4
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a3, a1, a3
+; RV32I-NEXT:    sltu a1, a5, t1
+; RV32I-NEXT:    sub a5, a5, t1
+; RV32I-NEXT:    sub a1, t0, a1
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    bgez a1, .LBB31_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    snez a5, a3
-; RV32I-NEXT:    snez a6, a4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    neg a7, a2
-; RV32I-NEXT:    sltu t0, a7, a5
-; RV32I-NEXT:    snez a2, a2
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, t0
-; RV32I-NEXT:    sub a2, a7, a5
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    snez a4, a3
+; RV32I-NEXT:    snez a6, a2
+; RV32I-NEXT:    neg a7, a5
+; RV32I-NEXT:    snez a5, a5
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    add a1, a1, a5
 ; RV32I-NEXT:    add a3, a3, a6
+; RV32I-NEXT:    sltu a6, a7, a4
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a5, a7, a4
+; RV32I-NEXT:    sub a1, a1, a6
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:  .LBB31_4:
-; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
@@ -2108,47 +2108,47 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a4, 0(a2)
 ; RV32ZBB-NEXT:    lw a3, 4(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a2)
-; RV32ZBB-NEXT:    lw t0, 12(a2)
-; RV32ZBB-NEXT:    lw a2, 8(a1)
-; RV32ZBB-NEXT:    lw t1, 12(a1)
-; RV32ZBB-NEXT:    lw a5, 0(a1)
-; RV32ZBB-NEXT:    lw a7, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a2, a6
-; RV32ZBB-NEXT:    sub t1, t1, t0
-; RV32ZBB-NEXT:    sltu t0, a5, a4
-; RV32ZBB-NEXT:    sub a1, t1, a1
-; RV32ZBB-NEXT:    mv t1, t0
-; RV32ZBB-NEXT:    beq a7, a3, .LBB31_2
+; RV32ZBB-NEXT:    lw a5, 8(a2)
+; RV32ZBB-NEXT:    lw a6, 12(a2)
+; RV32ZBB-NEXT:    lw a7, 8(a1)
+; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
+; RV32ZBB-NEXT:    lw a1, 4(a1)
+; RV32ZBB-NEXT:    sltu t1, a7, a5
+; RV32ZBB-NEXT:    sub t0, t0, a6
+; RV32ZBB-NEXT:    sltu a6, a2, a4
+; RV32ZBB-NEXT:    sub t0, t0, t1
+; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    beq a1, a3, .LBB31_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a7, a3
+; RV32ZBB-NEXT:    sltu t1, a1, a3
 ; RV32ZBB-NEXT:  .LBB31_2:
-; RV32ZBB-NEXT:    sub a2, a2, a6
-; RV32ZBB-NEXT:    sltu a6, a2, t1
-; RV32ZBB-NEXT:    sub a1, a1, a6
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    sub a3, a7, a3
-; RV32ZBB-NEXT:    sub a3, a3, t0
-; RV32ZBB-NEXT:    sub a4, a5, a4
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a3, a1, a3
+; RV32ZBB-NEXT:    sltu a1, a5, t1
+; RV32ZBB-NEXT:    sub a5, a5, t1
+; RV32ZBB-NEXT:    sub a1, t0, a1
+; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    bgez a1, .LBB31_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    snez a5, a3
-; RV32ZBB-NEXT:    snez a6, a4
-; RV32ZBB-NEXT:    or a5, a6, a5
-; RV32ZBB-NEXT:    neg a7, a2
-; RV32ZBB-NEXT:    sltu t0, a7, a5
-; RV32ZBB-NEXT:    snez a2, a2
-; RV32ZBB-NEXT:    add a1, a1, a2
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a1, a1, t0
-; RV32ZBB-NEXT:    sub a2, a7, a5
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    snez a4, a3
+; RV32ZBB-NEXT:    snez a6, a2
+; RV32ZBB-NEXT:    neg a7, a5
+; RV32ZBB-NEXT:    snez a5, a5
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    or a4, a6, a4
+; RV32ZBB-NEXT:    add a1, a1, a5
 ; RV32ZBB-NEXT:    add a3, a3, a6
+; RV32ZBB-NEXT:    sltu a6, a7, a4
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a5, a7, a4
+; RV32ZBB-NEXT:    sub a1, a1, a6
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:  .LBB31_4:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
+; RV32ZBB-NEXT:    sw a5, 8(a0)
 ; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
@@ -2176,47 +2176,47 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a4, 0(a2)
 ; RV32I-NEXT:    lw a3, 4(a2)
-; RV32I-NEXT:    lw a6, 8(a2)
-; RV32I-NEXT:    lw t0, 12(a2)
-; RV32I-NEXT:    lw a2, 8(a1)
-; RV32I-NEXT:    lw t1, 12(a1)
-; RV32I-NEXT:    lw a5, 0(a1)
-; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    sltu a1, a2, a6
-; RV32I-NEXT:    sub t1, t1, t0
-; RV32I-NEXT:    sltu t0, a5, a4
-; RV32I-NEXT:    sub a1, t1, a1
-; RV32I-NEXT:    mv t1, t0
-; RV32I-NEXT:    beq a7, a3, .LBB32_2
+; RV32I-NEXT:    lw a5, 8(a2)
+; RV32I-NEXT:    lw a6, 12(a2)
+; RV32I-NEXT:    lw a7, 8(a1)
+; RV32I-NEXT:    lw t0, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
+; RV32I-NEXT:    lw a1, 4(a1)
+; RV32I-NEXT:    sltu t1, a7, a5
+; RV32I-NEXT:    sub t0, t0, a6
+; RV32I-NEXT:    sltu a6, a2, a4
+; RV32I-NEXT:    sub t0, t0, t1
+; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    beq a1, a3, .LBB32_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a7, a3
+; RV32I-NEXT:    sltu t1, a1, a3
 ; RV32I-NEXT:  .LBB32_2:
-; RV32I-NEXT:    sub a2, a2, a6
-; RV32I-NEXT:    sltu a6, a2, t1
-; RV32I-NEXT:    sub a1, a1, a6
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    sub a3, a7, a3
-; RV32I-NEXT:    sub a3, a3, t0
-; RV32I-NEXT:    sub a4, a5, a4
+; RV32I-NEXT:    sub a5, a7, a5
+; RV32I-NEXT:    sub a3, a1, a3
+; RV32I-NEXT:    sltu a1, a5, t1
+; RV32I-NEXT:    sub a5, a5, t1
+; RV32I-NEXT:    sub a1, t0, a1
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    bgez a1, .LBB32_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    snez a5, a3
-; RV32I-NEXT:    snez a6, a4
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    neg a7, a2
-; RV32I-NEXT:    sltu t0, a7, a5
-; RV32I-NEXT:    snez a2, a2
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, t0
-; RV32I-NEXT:    sub a2, a7, a5
-; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    snez a4, a3
+; RV32I-NEXT:    snez a6, a2
+; RV32I-NEXT:    neg a7, a5
+; RV32I-NEXT:    snez a5, a5
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    add a1, a1, a5
 ; RV32I-NEXT:    add a3, a3, a6
+; RV32I-NEXT:    sltu a6, a7, a4
+; RV32I-NEXT:    neg a1, a1
+; RV32I-NEXT:    sub a5, a7, a4
+; RV32I-NEXT:    sub a1, a1, a6
 ; RV32I-NEXT:    neg a3, a3
 ; RV32I-NEXT:  .LBB32_4:
-; RV32I-NEXT:    sw a4, 0(a0)
+; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sw a5, 8(a0)
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
@@ -2239,47 +2239,47 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a4, 0(a2)
 ; RV32ZBB-NEXT:    lw a3, 4(a2)
-; RV32ZBB-NEXT:    lw a6, 8(a2)
-; RV32ZBB-NEXT:    lw t0, 12(a2)
-; RV32ZBB-NEXT:    lw a2, 8(a1)
-; RV32ZBB-NEXT:    lw t1, 12(a1)
-; RV32ZBB-NEXT:    lw a5, 0(a1)
-; RV32ZBB-NEXT:    lw a7, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a2, a6
-; RV32ZBB-NEXT:    sub t1, t1, t0
-; RV32ZBB-NEXT:    sltu t0, a5, a4
-; RV32ZBB-NEXT:    sub a1, t1, a1
-; RV32ZBB-NEXT:    mv t1, t0
-; RV32ZBB-NEXT:    beq a7, a3, .LBB32_2
+; RV32ZBB-NEXT:    lw a5, 8(a2)
+; RV32ZBB-NEXT:    lw a6, 12(a2)
+; RV32ZBB-NEXT:    lw a7, 8(a1)
+; RV32ZBB-NEXT:    lw t0, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
+; RV32ZBB-NEXT:    lw a1, 4(a1)
+; RV32ZBB-NEXT:    sltu t1, a7, a5
+; RV32ZBB-NEXT:    sub t0, t0, a6
+; RV32ZBB-NEXT:    sltu a6, a2, a4
+; RV32ZBB-NEXT:    sub t0, t0, t1
+; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    beq a1, a3, .LBB32_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu t1, a7, a3
+; RV32ZBB-NEXT:    sltu t1, a1, a3
 ; RV32ZBB-NEXT:  .LBB32_2:
-; RV32ZBB-NEXT:    sub a2, a2, a6
-; RV32ZBB-NEXT:    sltu a6, a2, t1
-; RV32ZBB-NEXT:    sub a1, a1, a6
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    sub a3, a7, a3
-; RV32ZBB-NEXT:    sub a3, a3, t0
-; RV32ZBB-NEXT:    sub a4, a5, a4
+; RV32ZBB-NEXT:    sub a5, a7, a5
+; RV32ZBB-NEXT:    sub a3, a1, a3
+; RV32ZBB-NEXT:    sltu a1, a5, t1
+; RV32ZBB-NEXT:    sub a5, a5, t1
+; RV32ZBB-NEXT:    sub a1, t0, a1
+; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a2, a2, a4
 ; RV32ZBB-NEXT:    bgez a1, .LBB32_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    snez a5, a3
-; RV32ZBB-NEXT:    snez a6, a4
-; RV32ZBB-NEXT:    or a5, a6, a5
-; RV32ZBB-NEXT:    neg a7, a2
-; RV32ZBB-NEXT:    sltu t0, a7, a5
-; RV32ZBB-NEXT:    snez a2, a2
-; RV32ZBB-NEXT:    add a1, a1, a2
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a1, a1, t0
-; RV32ZBB-NEXT:    sub a2, a7, a5
-; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    snez a4, a3
+; RV32ZBB-NEXT:    snez a6, a2
+; RV32ZBB-NEXT:    neg a7, a5
+; RV32ZBB-NEXT:    snez a5, a5
+; RV32ZBB-NEXT:    neg a2, a2
+; RV32ZBB-NEXT:    or a4, a6, a4
+; RV32ZBB-NEXT:    add a1, a1, a5
 ; RV32ZBB-NEXT:    add a3, a3, a6
+; RV32ZBB-NEXT:    sltu a6, a7, a4
+; RV32ZBB-NEXT:    neg a1, a1
+; RV32ZBB-NEXT:    sub a5, a7, a4
+; RV32ZBB-NEXT:    sub a1, a1, a6
 ; RV32ZBB-NEXT:    neg a3, a3
 ; RV32ZBB-NEXT:  .LBB32_4:
-; RV32ZBB-NEXT:    sw a4, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
+; RV32ZBB-NEXT:    sw a5, 8(a0)
 ; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
@@ -2349,8 +2349,8 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: abd_select_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -2361,8 +2361,8 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: abd_select_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -2389,8 +2389,8 @@ define i16 @abd_select_i16(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: abd_select_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sub a0, a0, a1
 ; RV32I-NEXT:    srai a1, a0, 31
@@ -2401,8 +2401,8 @@ define i16 @abd_select_i16(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_select_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -2582,30 +2582,30 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.11:
 ; RV32I-NEXT:    sub t0, t1, t0
 ; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    sub a2, t0, a2
-; RV32I-NEXT:    sltu a7, a6, t4
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a5, a3
 ; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a4, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t4
+; RV32I-NEXT:    sub a2, a1, t2
+; RV32I-NEXT:    sub a1, a4, a5
 ; RV32I-NEXT:    sub a4, a6, t4
 ; RV32I-NEXT:    j .LBB38_13
 ; RV32I-NEXT:  .LBB38_12:
 ; RV32I-NEXT:    sltu a2, a6, a7
 ; RV32I-NEXT:    sub t0, t0, t1
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sub a6, a6, a7
-; RV32I-NEXT:    sltu a7, a6, t6
-; RV32I-NEXT:    sub a2, a2, a7
 ; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t5
+; RV32I-NEXT:    sub a1, t0, a2
+; RV32I-NEXT:    sltu a5, a6, t6
+; RV32I-NEXT:    sub a2, a4, t5
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a4, a6, t6
 ; RV32I-NEXT:  .LBB38_13:
 ; RV32I-NEXT:    sw a3, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -2678,30 +2678,30 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.11:
 ; RV32ZBB-NEXT:    sub t0, t1, t0
 ; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    sub a2, t0, a2
-; RV32ZBB-NEXT:    sltu a7, a6, t4
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a5, a3
 ; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a4, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t4
+; RV32ZBB-NEXT:    sub a2, a1, t2
+; RV32ZBB-NEXT:    sub a1, a4, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t4
 ; RV32ZBB-NEXT:    j .LBB38_13
 ; RV32ZBB-NEXT:  .LBB38_12:
 ; RV32ZBB-NEXT:    sltu a2, a6, a7
 ; RV32ZBB-NEXT:    sub t0, t0, t1
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sub a6, a6, a7
-; RV32ZBB-NEXT:    sltu a7, a6, t6
-; RV32ZBB-NEXT:    sub a2, a2, a7
 ; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t5
+; RV32ZBB-NEXT:    sub a1, t0, a2
+; RV32ZBB-NEXT:    sltu a5, a6, t6
+; RV32ZBB-NEXT:    sub a2, a4, t5
+; RV32ZBB-NEXT:    sub a1, a1, a5
 ; RV32ZBB-NEXT:    sub a4, a6, t6
 ; RV32ZBB-NEXT:  .LBB38_13:
 ; RV32ZBB-NEXT:    sw a3, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/abdu-neg.ll b/llvm/test/CodeGen/RISCV/abdu-neg.ll
index b39285c3d343f5..9e41cde7ae181e 100644
--- a/llvm/test/CodeGen/RISCV/abdu-neg.ll
+++ b/llvm/test/CodeGen/RISCV/abdu-neg.ll
@@ -220,8 +220,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -322,8 +322,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -341,8 +341,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    slli a1, a1, 32
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    sub a0, a0, a1
 ; RV64ZBB-NEXT:    neg a1, a0
@@ -375,8 +375,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -426,8 +426,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -445,8 +445,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32_undef:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    slli a1, a1, 32
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    sub a0, a0, a1
 ; RV64ZBB-NEXT:    neg a1, a0
@@ -477,13 +477,13 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB9_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
 ; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    snez a2, a0
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    snez a1, a0
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
@@ -515,13 +515,13 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB9_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
 ; RV32ZBB-NEXT:    add a0, a2, a0
-; RV32ZBB-NEXT:    snez a2, a0
-; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    sub a3, a3, a1
+; RV32ZBB-NEXT:    snez a1, a0
+; RV32ZBB-NEXT:    add a1, a3, a1
 ; RV32ZBB-NEXT:    neg a1, a1
 ; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:    ret
@@ -557,13 +557,13 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB10_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
 ; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    snez a2, a0
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    sub a3, a3, a1
+; RV32I-NEXT:    snez a1, a0
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    neg a1, a1
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
@@ -595,13 +595,13 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB10_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
 ; RV32ZBB-NEXT:    add a0, a2, a0
-; RV32ZBB-NEXT:    snez a2, a0
-; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    sub a3, a3, a1
+; RV32ZBB-NEXT:    snez a1, a0
+; RV32ZBB-NEXT:    add a1, a3, a1
 ; RV32ZBB-NEXT:    neg a1, a1
 ; RV32ZBB-NEXT:    neg a0, a0
 ; RV32ZBB-NEXT:    ret
@@ -624,87 +624,87 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a4, 0(a2)
-; RV32I-NEXT:    lw a6, 4(a2)
-; RV32I-NEXT:    lw t1, 8(a2)
-; RV32I-NEXT:    lw a2, 12(a2)
-; RV32I-NEXT:    lw a3, 8(a1)
-; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    lw a7, 0(a1)
+; RV32I-NEXT:    lw a5, 0(a2)
+; RV32I-NEXT:    lw a7, 4(a2)
+; RV32I-NEXT:    lw a3, 8(a2)
+; RV32I-NEXT:    lw t1, 12(a2)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a6, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw t0, 4(a1)
-; RV32I-NEXT:    sltu a1, a3, t1
-; RV32I-NEXT:    sub a2, a5, a2
-; RV32I-NEXT:    sltu t2, a7, a4
-; RV32I-NEXT:    sub a1, a2, a1
-; RV32I-NEXT:    mv a2, t2
-; RV32I-NEXT:    beq t0, a6, .LBB11_2
+; RV32I-NEXT:    sltu a1, a4, a3
+; RV32I-NEXT:    sub t1, a6, t1
+; RV32I-NEXT:    sltu t2, a2, a5
+; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    mv t1, t2
+; RV32I-NEXT:    beq t0, a7, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a2, t0, a6
+; RV32I-NEXT:    sltu t1, t0, a7
 ; RV32I-NEXT:  .LBB11_2:
-; RV32I-NEXT:    sub t1, a3, t1
-; RV32I-NEXT:    sltu t3, t1, a2
+; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    sltu t3, a3, t1
 ; RV32I-NEXT:    sub a1, a1, t3
-; RV32I-NEXT:    sub a2, t1, a2
-; RV32I-NEXT:    beq a1, a5, .LBB11_4
+; RV32I-NEXT:    sub a3, a3, t1
+; RV32I-NEXT:    beq a1, a6, .LBB11_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t1, a5, a1
+; RV32I-NEXT:    sltu t1, a6, a1
 ; RV32I-NEXT:    j .LBB11_5
 ; RV32I-NEXT:  .LBB11_4:
-; RV32I-NEXT:    sltu t1, a3, a2
+; RV32I-NEXT:    sltu t1, a4, a3
 ; RV32I-NEXT:  .LBB11_5:
-; RV32I-NEXT:    sub a6, t0, a6
-; RV32I-NEXT:    sub a6, a6, t2
-; RV32I-NEXT:    sub t2, a7, a4
-; RV32I-NEXT:    beq a6, t0, .LBB11_7
+; RV32I-NEXT:    sub a7, t0, a7
+; RV32I-NEXT:    sub a7, a7, t2
+; RV32I-NEXT:    sub a5, a2, a5
+; RV32I-NEXT:    beq a7, t0, .LBB11_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    sltu a4, t0, a6
+; RV32I-NEXT:    sltu a2, t0, a7
 ; RV32I-NEXT:    j .LBB11_8
 ; RV32I-NEXT:  .LBB11_7:
-; RV32I-NEXT:    sltu a4, a7, t2
+; RV32I-NEXT:    sltu a2, a2, a5
 ; RV32I-NEXT:  .LBB11_8:
-; RV32I-NEXT:    xor a5, a1, a5
-; RV32I-NEXT:    xor a3, a2, a3
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    beqz a3, .LBB11_10
+; RV32I-NEXT:    xor a6, a1, a6
+; RV32I-NEXT:    xor a4, a3, a4
+; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    beqz a4, .LBB11_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a4, t1
+; RV32I-NEXT:    mv a2, t1
 ; RV32I-NEXT:  .LBB11_10:
-; RV32I-NEXT:    neg t0, a4
-; RV32I-NEXT:    xor a5, t2, t0
-; RV32I-NEXT:    sltu t2, a5, t0
-; RV32I-NEXT:    xor t3, a6, t0
-; RV32I-NEXT:    add a3, t3, a4
-; RV32I-NEXT:    sub a3, a3, t2
-; RV32I-NEXT:    snez t1, a3
-; RV32I-NEXT:    add a5, a5, a4
-; RV32I-NEXT:    snez a7, a5
-; RV32I-NEXT:    or t1, a7, t1
-; RV32I-NEXT:    beqz a6, .LBB11_12
+; RV32I-NEXT:    neg a4, a2
+; RV32I-NEXT:    xor t0, a5, a4
+; RV32I-NEXT:    xor t3, a7, a4
+; RV32I-NEXT:    sltu a5, t0, a4
+; RV32I-NEXT:    add a6, t3, a2
+; RV32I-NEXT:    add t0, t0, a2
+; RV32I-NEXT:    sub t1, a6, a5
+; RV32I-NEXT:    snez a6, t1
+; RV32I-NEXT:    snez t2, t0
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    beqz a7, .LBB11_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t2, t3, t0
+; RV32I-NEXT:    sltu a5, t3, a4
 ; RV32I-NEXT:  .LBB11_12:
-; RV32I-NEXT:    xor a2, a2, t0
-; RV32I-NEXT:    add a6, a2, a4
-; RV32I-NEXT:    sub t3, a6, t2
-; RV32I-NEXT:    neg t4, t3
-; RV32I-NEXT:    sltu t5, t4, t1
-; RV32I-NEXT:    sltu a2, a2, t0
-; RV32I-NEXT:    xor a1, a1, t0
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    sltu a2, a6, t2
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    snez a2, t3
+; RV32I-NEXT:    xor a3, a3, a4
+; RV32I-NEXT:    xor a1, a1, a4
+; RV32I-NEXT:    add t1, t1, t2
+; RV32I-NEXT:    neg a7, t0
+; RV32I-NEXT:    add t0, a3, a2
+; RV32I-NEXT:    sltu a3, a3, a4
 ; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    neg a2, t1
+; RV32I-NEXT:    sub a4, t0, a5
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sltu a3, t0, a5
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    snez a3, a4
+; RV32I-NEXT:    sltu a4, a5, a6
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    sub a3, a5, a6
 ; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, t5
-; RV32I-NEXT:    sub a2, t4, t1
-; RV32I-NEXT:    add a3, a3, a7
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    neg a4, a5
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    sw a7, 0(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a3, 8(a0)
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
@@ -723,100 +723,100 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB11_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
 ; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    snez a2, a0
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    sub a3, a3, a1
+; RV64I-NEXT:    snez a1, a0
+; RV64I-NEXT:    add a1, a3, a1
 ; RV64I-NEXT:    neg a1, a1
 ; RV64I-NEXT:    neg a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: abd_ext_i128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a4, 0(a2)
-; RV32ZBB-NEXT:    lw a6, 4(a2)
-; RV32ZBB-NEXT:    lw t1, 8(a2)
-; RV32ZBB-NEXT:    lw a2, 12(a2)
-; RV32ZBB-NEXT:    lw a3, 8(a1)
-; RV32ZBB-NEXT:    lw a5, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 0(a1)
+; RV32ZBB-NEXT:    lw a5, 0(a2)
+; RV32ZBB-NEXT:    lw a7, 4(a2)
+; RV32ZBB-NEXT:    lw a3, 8(a2)
+; RV32ZBB-NEXT:    lw t1, 12(a2)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw t0, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a3, t1
-; RV32ZBB-NEXT:    sub a2, a5, a2
-; RV32ZBB-NEXT:    sltu t2, a7, a4
-; RV32ZBB-NEXT:    sub a1, a2, a1
-; RV32ZBB-NEXT:    mv a2, t2
-; RV32ZBB-NEXT:    beq t0, a6, .LBB11_2
+; RV32ZBB-NEXT:    sltu a1, a4, a3
+; RV32ZBB-NEXT:    sub t1, a6, t1
+; RV32ZBB-NEXT:    sltu t2, a2, a5
+; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    mv t1, t2
+; RV32ZBB-NEXT:    beq t0, a7, .LBB11_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu a2, t0, a6
+; RV32ZBB-NEXT:    sltu t1, t0, a7
 ; RV32ZBB-NEXT:  .LBB11_2:
-; RV32ZBB-NEXT:    sub t1, a3, t1
-; RV32ZBB-NEXT:    sltu t3, t1, a2
+; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    sltu t3, a3, t1
 ; RV32ZBB-NEXT:    sub a1, a1, t3
-; RV32ZBB-NEXT:    sub a2, t1, a2
-; RV32ZBB-NEXT:    beq a1, a5, .LBB11_4
+; RV32ZBB-NEXT:    sub a3, a3, t1
+; RV32ZBB-NEXT:    beq a1, a6, .LBB11_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t1, a5, a1
+; RV32ZBB-NEXT:    sltu t1, a6, a1
 ; RV32ZBB-NEXT:    j .LBB11_5
 ; RV32ZBB-NEXT:  .LBB11_4:
-; RV32ZBB-NEXT:    sltu t1, a3, a2
+; RV32ZBB-NEXT:    sltu t1, a4, a3
 ; RV32ZBB-NEXT:  .LBB11_5:
-; RV32ZBB-NEXT:    sub a6, t0, a6
-; RV32ZBB-NEXT:    sub a6, a6, t2
-; RV32ZBB-NEXT:    sub t2, a7, a4
-; RV32ZBB-NEXT:    beq a6, t0, .LBB11_7
+; RV32ZBB-NEXT:    sub a7, t0, a7
+; RV32ZBB-NEXT:    sub a7, a7, t2
+; RV32ZBB-NEXT:    sub a5, a2, a5
+; RV32ZBB-NEXT:    beq a7, t0, .LBB11_7
 ; RV32ZBB-NEXT:  # %bb.6:
-; RV32ZBB-NEXT:    sltu a4, t0, a6
+; RV32ZBB-NEXT:    sltu a2, t0, a7
 ; RV32ZBB-NEXT:    j .LBB11_8
 ; RV32ZBB-NEXT:  .LBB11_7:
-; RV32ZBB-NEXT:    sltu a4, a7, t2
+; RV32ZBB-NEXT:    sltu a2, a2, a5
 ; RV32ZBB-NEXT:  .LBB11_8:
-; RV32ZBB-NEXT:    xor a5, a1, a5
-; RV32ZBB-NEXT:    xor a3, a2, a3
-; RV32ZBB-NEXT:    or a3, a3, a5
-; RV32ZBB-NEXT:    beqz a3, .LBB11_10
+; RV32ZBB-NEXT:    xor a6, a1, a6
+; RV32ZBB-NEXT:    xor a4, a3, a4
+; RV32ZBB-NEXT:    or a4, a4, a6
+; RV32ZBB-NEXT:    beqz a4, .LBB11_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    mv a4, t1
+; RV32ZBB-NEXT:    mv a2, t1
 ; RV32ZBB-NEXT:  .LBB11_10:
-; RV32ZBB-NEXT:    neg t0, a4
-; RV32ZBB-NEXT:    xor a5, t2, t0
-; RV32ZBB-NEXT:    sltu t2, a5, t0
-; RV32ZBB-NEXT:    xor t3, a6, t0
-; RV32ZBB-NEXT:    add a3, t3, a4
-; RV32ZBB-NEXT:    sub a3, a3, t2
-; RV32ZBB-NEXT:    snez t1, a3
-; RV32ZBB-NEXT:    add a5, a5, a4
-; RV32ZBB-NEXT:    snez a7, a5
-; RV32ZBB-NEXT:    or t1, a7, t1
-; RV32ZBB-NEXT:    beqz a6, .LBB11_12
+; RV32ZBB-NEXT:    neg a4, a2
+; RV32ZBB-NEXT:    xor t0, a5, a4
+; RV32ZBB-NEXT:    xor t3, a7, a4
+; RV32ZBB-NEXT:    sltu a5, t0, a4
+; RV32ZBB-NEXT:    add a6, t3, a2
+; RV32ZBB-NEXT:    add t0, t0, a2
+; RV32ZBB-NEXT:    sub t1, a6, a5
+; RV32ZBB-NEXT:    snez a6, t1
+; RV32ZBB-NEXT:    snez t2, t0
+; RV32ZBB-NEXT:    or a6, t2, a6
+; RV32ZBB-NEXT:    beqz a7, .LBB11_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t2, t3, t0
+; RV32ZBB-NEXT:    sltu a5, t3, a4
 ; RV32ZBB-NEXT:  .LBB11_12:
-; RV32ZBB-NEXT:    xor a2, a2, t0
-; RV32ZBB-NEXT:    add a6, a2, a4
-; RV32ZBB-NEXT:    sub t3, a6, t2
-; RV32ZBB-NEXT:    neg t4, t3
-; RV32ZBB-NEXT:    sltu t5, t4, t1
-; RV32ZBB-NEXT:    sltu a2, a2, t0
-; RV32ZBB-NEXT:    xor a1, a1, t0
-; RV32ZBB-NEXT:    add a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, a2
-; RV32ZBB-NEXT:    sltu a2, a6, t2
-; RV32ZBB-NEXT:    sub a1, a1, a2
-; RV32ZBB-NEXT:    snez a2, t3
+; RV32ZBB-NEXT:    xor a3, a3, a4
+; RV32ZBB-NEXT:    xor a1, a1, a4
+; RV32ZBB-NEXT:    add t1, t1, t2
+; RV32ZBB-NEXT:    neg a7, t0
+; RV32ZBB-NEXT:    add t0, a3, a2
+; RV32ZBB-NEXT:    sltu a3, a3, a4
 ; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    neg a2, t1
+; RV32ZBB-NEXT:    sub a4, t0, a5
+; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    sltu a3, t0, a5
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    snez a3, a4
+; RV32ZBB-NEXT:    sltu a4, a5, a6
+; RV32ZBB-NEXT:    add a1, a1, a3
+; RV32ZBB-NEXT:    sub a3, a5, a6
 ; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a1, a1, t5
-; RV32ZBB-NEXT:    sub a2, t4, t1
-; RV32ZBB-NEXT:    add a3, a3, a7
-; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    neg a4, a5
-; RV32ZBB-NEXT:    sw a4, 0(a0)
-; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
+; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    sw a7, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
+; RV32ZBB-NEXT:    sw a3, 8(a0)
 ; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
@@ -835,13 +835,13 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB11_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
 ; RV64ZBB-NEXT:    add a0, a2, a0
-; RV64ZBB-NEXT:    snez a2, a0
-; RV64ZBB-NEXT:    add a1, a1, a2
+; RV64ZBB-NEXT:    sub a3, a3, a1
+; RV64ZBB-NEXT:    snez a1, a0
+; RV64ZBB-NEXT:    add a1, a3, a1
 ; RV64ZBB-NEXT:    neg a1, a1
 ; RV64ZBB-NEXT:    neg a0, a0
 ; RV64ZBB-NEXT:    ret
@@ -857,87 +857,87 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-LABEL: abd_ext_i128_undef:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a4, 0(a2)
-; RV32I-NEXT:    lw a6, 4(a2)
-; RV32I-NEXT:    lw t1, 8(a2)
-; RV32I-NEXT:    lw a2, 12(a2)
-; RV32I-NEXT:    lw a3, 8(a1)
-; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    lw a7, 0(a1)
+; RV32I-NEXT:    lw a5, 0(a2)
+; RV32I-NEXT:    lw a7, 4(a2)
+; RV32I-NEXT:    lw a3, 8(a2)
+; RV32I-NEXT:    lw t1, 12(a2)
+; RV32I-NEXT:    lw a4, 8(a1)
+; RV32I-NEXT:    lw a6, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw t0, 4(a1)
-; RV32I-NEXT:    sltu a1, a3, t1
-; RV32I-NEXT:    sub a2, a5, a2
-; RV32I-NEXT:    sltu t2, a7, a4
-; RV32I-NEXT:    sub a1, a2, a1
-; RV32I-NEXT:    mv a2, t2
-; RV32I-NEXT:    beq t0, a6, .LBB12_2
+; RV32I-NEXT:    sltu a1, a4, a3
+; RV32I-NEXT:    sub t1, a6, t1
+; RV32I-NEXT:    sltu t2, a2, a5
+; RV32I-NEXT:    sub a1, t1, a1
+; RV32I-NEXT:    mv t1, t2
+; RV32I-NEXT:    beq t0, a7, .LBB12_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a2, t0, a6
+; RV32I-NEXT:    sltu t1, t0, a7
 ; RV32I-NEXT:  .LBB12_2:
-; RV32I-NEXT:    sub t1, a3, t1
-; RV32I-NEXT:    sltu t3, t1, a2
+; RV32I-NEXT:    sub a3, a4, a3
+; RV32I-NEXT:    sltu t3, a3, t1
 ; RV32I-NEXT:    sub a1, a1, t3
-; RV32I-NEXT:    sub a2, t1, a2
-; RV32I-NEXT:    beq a1, a5, .LBB12_4
+; RV32I-NEXT:    sub a3, a3, t1
+; RV32I-NEXT:    beq a1, a6, .LBB12_4
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    sltu t1, a5, a1
+; RV32I-NEXT:    sltu t1, a6, a1
 ; RV32I-NEXT:    j .LBB12_5
 ; RV32I-NEXT:  .LBB12_4:
-; RV32I-NEXT:    sltu t1, a3, a2
+; RV32I-NEXT:    sltu t1, a4, a3
 ; RV32I-NEXT:  .LBB12_5:
-; RV32I-NEXT:    sub a6, t0, a6
-; RV32I-NEXT:    sub a6, a6, t2
-; RV32I-NEXT:    sub t2, a7, a4
-; RV32I-NEXT:    beq a6, t0, .LBB12_7
+; RV32I-NEXT:    sub a7, t0, a7
+; RV32I-NEXT:    sub a7, a7, t2
+; RV32I-NEXT:    sub a5, a2, a5
+; RV32I-NEXT:    beq a7, t0, .LBB12_7
 ; RV32I-NEXT:  # %bb.6:
-; RV32I-NEXT:    sltu a4, t0, a6
+; RV32I-NEXT:    sltu a2, t0, a7
 ; RV32I-NEXT:    j .LBB12_8
 ; RV32I-NEXT:  .LBB12_7:
-; RV32I-NEXT:    sltu a4, a7, t2
+; RV32I-NEXT:    sltu a2, a2, a5
 ; RV32I-NEXT:  .LBB12_8:
-; RV32I-NEXT:    xor a5, a1, a5
-; RV32I-NEXT:    xor a3, a2, a3
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    beqz a3, .LBB12_10
+; RV32I-NEXT:    xor a6, a1, a6
+; RV32I-NEXT:    xor a4, a3, a4
+; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    beqz a4, .LBB12_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a4, t1
+; RV32I-NEXT:    mv a2, t1
 ; RV32I-NEXT:  .LBB12_10:
-; RV32I-NEXT:    neg t0, a4
-; RV32I-NEXT:    xor a5, t2, t0
-; RV32I-NEXT:    sltu t2, a5, t0
-; RV32I-NEXT:    xor t3, a6, t0
-; RV32I-NEXT:    add a3, t3, a4
-; RV32I-NEXT:    sub a3, a3, t2
-; RV32I-NEXT:    snez t1, a3
-; RV32I-NEXT:    add a5, a5, a4
-; RV32I-NEXT:    snez a7, a5
-; RV32I-NEXT:    or t1, a7, t1
-; RV32I-NEXT:    beqz a6, .LBB12_12
+; RV32I-NEXT:    neg a4, a2
+; RV32I-NEXT:    xor t0, a5, a4
+; RV32I-NEXT:    xor t3, a7, a4
+; RV32I-NEXT:    sltu a5, t0, a4
+; RV32I-NEXT:    add a6, t3, a2
+; RV32I-NEXT:    add t0, t0, a2
+; RV32I-NEXT:    sub t1, a6, a5
+; RV32I-NEXT:    snez a6, t1
+; RV32I-NEXT:    snez t2, t0
+; RV32I-NEXT:    or a6, t2, a6
+; RV32I-NEXT:    beqz a7, .LBB12_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t2, t3, t0
+; RV32I-NEXT:    sltu a5, t3, a4
 ; RV32I-NEXT:  .LBB12_12:
-; RV32I-NEXT:    xor a2, a2, t0
-; RV32I-NEXT:    add a6, a2, a4
-; RV32I-NEXT:    sub t3, a6, t2
-; RV32I-NEXT:    neg t4, t3
-; RV32I-NEXT:    sltu t5, t4, t1
-; RV32I-NEXT:    sltu a2, a2, t0
-; RV32I-NEXT:    xor a1, a1, t0
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    sltu a2, a6, t2
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    snez a2, t3
+; RV32I-NEXT:    xor a3, a3, a4
+; RV32I-NEXT:    xor a1, a1, a4
+; RV32I-NEXT:    add t1, t1, t2
+; RV32I-NEXT:    neg a7, t0
+; RV32I-NEXT:    add t0, a3, a2
+; RV32I-NEXT:    sltu a3, a3, a4
 ; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    neg a2, t1
+; RV32I-NEXT:    sub a4, t0, a5
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sltu a3, t0, a5
+; RV32I-NEXT:    neg a5, a4
+; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    snez a3, a4
+; RV32I-NEXT:    sltu a4, a5, a6
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    sub a3, a5, a6
 ; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, t5
-; RV32I-NEXT:    sub a2, t4, t1
-; RV32I-NEXT:    add a3, a3, a7
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    neg a4, a5
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    sw a7, 0(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a3, 8(a0)
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
@@ -956,100 +956,100 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB12_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
 ; RV64I-NEXT:    add a0, a2, a0
-; RV64I-NEXT:    snez a2, a0
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    sub a3, a3, a1
+; RV64I-NEXT:    snez a1, a0
+; RV64I-NEXT:    add a1, a3, a1
 ; RV64I-NEXT:    neg a1, a1
 ; RV64I-NEXT:    neg a0, a0
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: abd_ext_i128_undef:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a4, 0(a2)
-; RV32ZBB-NEXT:    lw a6, 4(a2)
-; RV32ZBB-NEXT:    lw t1, 8(a2)
-; RV32ZBB-NEXT:    lw a2, 12(a2)
-; RV32ZBB-NEXT:    lw a3, 8(a1)
-; RV32ZBB-NEXT:    lw a5, 12(a1)
-; RV32ZBB-NEXT:    lw a7, 0(a1)
+; RV32ZBB-NEXT:    lw a5, 0(a2)
+; RV32ZBB-NEXT:    lw a7, 4(a2)
+; RV32ZBB-NEXT:    lw a3, 8(a2)
+; RV32ZBB-NEXT:    lw t1, 12(a2)
+; RV32ZBB-NEXT:    lw a4, 8(a1)
+; RV32ZBB-NEXT:    lw a6, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw t0, 4(a1)
-; RV32ZBB-NEXT:    sltu a1, a3, t1
-; RV32ZBB-NEXT:    sub a2, a5, a2
-; RV32ZBB-NEXT:    sltu t2, a7, a4
-; RV32ZBB-NEXT:    sub a1, a2, a1
-; RV32ZBB-NEXT:    mv a2, t2
-; RV32ZBB-NEXT:    beq t0, a6, .LBB12_2
+; RV32ZBB-NEXT:    sltu a1, a4, a3
+; RV32ZBB-NEXT:    sub t1, a6, t1
+; RV32ZBB-NEXT:    sltu t2, a2, a5
+; RV32ZBB-NEXT:    sub a1, t1, a1
+; RV32ZBB-NEXT:    mv t1, t2
+; RV32ZBB-NEXT:    beq t0, a7, .LBB12_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    sltu a2, t0, a6
+; RV32ZBB-NEXT:    sltu t1, t0, a7
 ; RV32ZBB-NEXT:  .LBB12_2:
-; RV32ZBB-NEXT:    sub t1, a3, t1
-; RV32ZBB-NEXT:    sltu t3, t1, a2
+; RV32ZBB-NEXT:    sub a3, a4, a3
+; RV32ZBB-NEXT:    sltu t3, a3, t1
 ; RV32ZBB-NEXT:    sub a1, a1, t3
-; RV32ZBB-NEXT:    sub a2, t1, a2
-; RV32ZBB-NEXT:    beq a1, a5, .LBB12_4
+; RV32ZBB-NEXT:    sub a3, a3, t1
+; RV32ZBB-NEXT:    beq a1, a6, .LBB12_4
 ; RV32ZBB-NEXT:  # %bb.3:
-; RV32ZBB-NEXT:    sltu t1, a5, a1
+; RV32ZBB-NEXT:    sltu t1, a6, a1
 ; RV32ZBB-NEXT:    j .LBB12_5
 ; RV32ZBB-NEXT:  .LBB12_4:
-; RV32ZBB-NEXT:    sltu t1, a3, a2
+; RV32ZBB-NEXT:    sltu t1, a4, a3
 ; RV32ZBB-NEXT:  .LBB12_5:
-; RV32ZBB-NEXT:    sub a6, t0, a6
-; RV32ZBB-NEXT:    sub a6, a6, t2
-; RV32ZBB-NEXT:    sub t2, a7, a4
-; RV32ZBB-NEXT:    beq a6, t0, .LBB12_7
+; RV32ZBB-NEXT:    sub a7, t0, a7
+; RV32ZBB-NEXT:    sub a7, a7, t2
+; RV32ZBB-NEXT:    sub a5, a2, a5
+; RV32ZBB-NEXT:    beq a7, t0, .LBB12_7
 ; RV32ZBB-NEXT:  # %bb.6:
-; RV32ZBB-NEXT:    sltu a4, t0, a6
+; RV32ZBB-NEXT:    sltu a2, t0, a7
 ; RV32ZBB-NEXT:    j .LBB12_8
 ; RV32ZBB-NEXT:  .LBB12_7:
-; RV32ZBB-NEXT:    sltu a4, a7, t2
+; RV32ZBB-NEXT:    sltu a2, a2, a5
 ; RV32ZBB-NEXT:  .LBB12_8:
-; RV32ZBB-NEXT:    xor a5, a1, a5
-; RV32ZBB-NEXT:    xor a3, a2, a3
-; RV32ZBB-NEXT:    or a3, a3, a5
-; RV32ZBB-NEXT:    beqz a3, .LBB12_10
+; RV32ZBB-NEXT:    xor a6, a1, a6
+; RV32ZBB-NEXT:    xor a4, a3, a4
+; RV32ZBB-NEXT:    or a4, a4, a6
+; RV32ZBB-NEXT:    beqz a4, .LBB12_10
 ; RV32ZBB-NEXT:  # %bb.9:
-; RV32ZBB-NEXT:    mv a4, t1
+; RV32ZBB-NEXT:    mv a2, t1
 ; RV32ZBB-NEXT:  .LBB12_10:
-; RV32ZBB-NEXT:    neg t0, a4
-; RV32ZBB-NEXT:    xor a5, t2, t0
-; RV32ZBB-NEXT:    sltu t2, a5, t0
-; RV32ZBB-NEXT:    xor t3, a6, t0
-; RV32ZBB-NEXT:    add a3, t3, a4
-; RV32ZBB-NEXT:    sub a3, a3, t2
-; RV32ZBB-NEXT:    snez t1, a3
-; RV32ZBB-NEXT:    add a5, a5, a4
-; RV32ZBB-NEXT:    snez a7, a5
-; RV32ZBB-NEXT:    or t1, a7, t1
-; RV32ZBB-NEXT:    beqz a6, .LBB12_12
+; RV32ZBB-NEXT:    neg a4, a2
+; RV32ZBB-NEXT:    xor t0, a5, a4
+; RV32ZBB-NEXT:    xor t3, a7, a4
+; RV32ZBB-NEXT:    sltu a5, t0, a4
+; RV32ZBB-NEXT:    add a6, t3, a2
+; RV32ZBB-NEXT:    add t0, t0, a2
+; RV32ZBB-NEXT:    sub t1, a6, a5
+; RV32ZBB-NEXT:    snez a6, t1
+; RV32ZBB-NEXT:    snez t2, t0
+; RV32ZBB-NEXT:    or a6, t2, a6
+; RV32ZBB-NEXT:    beqz a7, .LBB12_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t2, t3, t0
+; RV32ZBB-NEXT:    sltu a5, t3, a4
 ; RV32ZBB-NEXT:  .LBB12_12:
-; RV32ZBB-NEXT:    xor a2, a2, t0
-; RV32ZBB-NEXT:    add a6, a2, a4
-; RV32ZBB-NEXT:    sub t3, a6, t2
-; RV32ZBB-NEXT:    neg t4, t3
-; RV32ZBB-NEXT:    sltu t5, t4, t1
-; RV32ZBB-NEXT:    sltu a2, a2, t0
-; RV32ZBB-NEXT:    xor a1, a1, t0
-; RV32ZBB-NEXT:    add a1, a1, a4
-; RV32ZBB-NEXT:    sub a1, a1, a2
-; RV32ZBB-NEXT:    sltu a2, a6, t2
-; RV32ZBB-NEXT:    sub a1, a1, a2
-; RV32ZBB-NEXT:    snez a2, t3
+; RV32ZBB-NEXT:    xor a3, a3, a4
+; RV32ZBB-NEXT:    xor a1, a1, a4
+; RV32ZBB-NEXT:    add t1, t1, t2
+; RV32ZBB-NEXT:    neg a7, t0
+; RV32ZBB-NEXT:    add t0, a3, a2
+; RV32ZBB-NEXT:    sltu a3, a3, a4
 ; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    neg a2, t1
+; RV32ZBB-NEXT:    sub a4, t0, a5
+; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    sltu a3, t0, a5
+; RV32ZBB-NEXT:    neg a5, a4
+; RV32ZBB-NEXT:    sub a1, a1, a3
+; RV32ZBB-NEXT:    snez a3, a4
+; RV32ZBB-NEXT:    sltu a4, a5, a6
+; RV32ZBB-NEXT:    add a1, a1, a3
+; RV32ZBB-NEXT:    sub a3, a5, a6
 ; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a1, a1, t5
-; RV32ZBB-NEXT:    sub a2, t4, t1
-; RV32ZBB-NEXT:    add a3, a3, a7
-; RV32ZBB-NEXT:    neg a3, a3
-; RV32ZBB-NEXT:    neg a4, a5
-; RV32ZBB-NEXT:    sw a4, 0(a0)
-; RV32ZBB-NEXT:    sw a3, 4(a0)
-; RV32ZBB-NEXT:    sw a2, 8(a0)
+; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    sw a7, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 4(a0)
+; RV32ZBB-NEXT:    sw a3, 8(a0)
 ; RV32ZBB-NEXT:    sw a1, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
@@ -1068,13 +1068,13 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB12_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
 ; RV64ZBB-NEXT:    add a0, a2, a0
-; RV64ZBB-NEXT:    snez a2, a0
-; RV64ZBB-NEXT:    add a1, a1, a2
+; RV64ZBB-NEXT:    sub a3, a3, a1
+; RV64ZBB-NEXT:    snez a1, a0
+; RV64ZBB-NEXT:    add a1, a3, a1
 ; RV64ZBB-NEXT:    neg a1, a1
 ; RV64ZBB-NEXT:    neg a0, a0
 ; RV64ZBB-NEXT:    ret
@@ -1402,26 +1402,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    mv a5, t0
 ; RV32I-NEXT:    mv a4, a7
 ; RV32I-NEXT:  .LBB17_19:
-; RV32I-NEXT:    sltu a6, t3, a4
-; RV32I-NEXT:    sub a7, t4, a5
-; RV32I-NEXT:    sltu a5, a2, a1
-; RV32I-NEXT:    sub a6, a7, a6
-; RV32I-NEXT:    mv a7, a5
+; RV32I-NEXT:    sltu a7, t3, a4
+; RV32I-NEXT:    sub a5, t4, a5
+; RV32I-NEXT:    sltu a6, a2, a1
+; RV32I-NEXT:    sub a5, a5, a7
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beq t1, a3, .LBB17_21
 ; RV32I-NEXT:  # %bb.20:
 ; RV32I-NEXT:    sltu a7, t1, a3
 ; RV32I-NEXT:  .LBB17_21:
 ; RV32I-NEXT:    sub a4, t3, a4
-; RV32I-NEXT:    sltu t0, a4, a7
-; RV32I-NEXT:    sub a6, a6, t0
-; RV32I-NEXT:    sub a4, a4, a7
 ; RV32I-NEXT:    sub a3, t1, a3
-; RV32I-NEXT:    sub a3, a3, a5
 ; RV32I-NEXT:    sub a2, a2, a1
+; RV32I-NEXT:    sltu a1, a4, a7
+; RV32I-NEXT:    sub a4, a4, a7
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    sub a5, a5, a1
 ; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a3, 4(a0)
 ; RV32I-NEXT:    sw a4, 8(a0)
-; RV32I-NEXT:    sw a6, 12(a0)
+; RV32I-NEXT:    sw a5, 12(a0)
 ; RV32I-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 16
 ; RV32I-NEXT:    ret
@@ -1529,26 +1529,26 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:    mv a5, t0
 ; RV32ZBB-NEXT:    mv a4, a7
 ; RV32ZBB-NEXT:  .LBB17_19:
-; RV32ZBB-NEXT:    sltu a6, t3, a4
-; RV32ZBB-NEXT:    sub a7, t4, a5
-; RV32ZBB-NEXT:    sltu a5, a2, a1
-; RV32ZBB-NEXT:    sub a6, a7, a6
-; RV32ZBB-NEXT:    mv a7, a5
+; RV32ZBB-NEXT:    sltu a7, t3, a4
+; RV32ZBB-NEXT:    sub a5, t4, a5
+; RV32ZBB-NEXT:    sltu a6, a2, a1
+; RV32ZBB-NEXT:    sub a5, a5, a7
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beq t1, a3, .LBB17_21
 ; RV32ZBB-NEXT:  # %bb.20:
 ; RV32ZBB-NEXT:    sltu a7, t1, a3
 ; RV32ZBB-NEXT:  .LBB17_21:
 ; RV32ZBB-NEXT:    sub a4, t3, a4
-; RV32ZBB-NEXT:    sltu t0, a4, a7
-; RV32ZBB-NEXT:    sub a6, a6, t0
-; RV32ZBB-NEXT:    sub a4, a4, a7
 ; RV32ZBB-NEXT:    sub a3, t1, a3
-; RV32ZBB-NEXT:    sub a3, a3, a5
 ; RV32ZBB-NEXT:    sub a2, a2, a1
+; RV32ZBB-NEXT:    sltu a1, a4, a7
+; RV32ZBB-NEXT:    sub a4, a4, a7
+; RV32ZBB-NEXT:    sub a3, a3, a6
+; RV32ZBB-NEXT:    sub a5, a5, a1
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a3, 4(a0)
 ; RV32ZBB-NEXT:    sw a4, 8(a0)
-; RV32ZBB-NEXT:    sw a6, 12(a0)
+; RV32ZBB-NEXT:    sw a5, 12(a0)
 ; RV32ZBB-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBB-NEXT:    addi sp, sp, 16
 ; RV32ZBB-NEXT:    ret
@@ -1835,30 +1835,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  # %bb.9:
 ; RV32I-NEXT:    sltu t1, a5, a6
 ; RV32I-NEXT:    sub a7, a7, t0
-; RV32I-NEXT:    sub a7, a7, t1
-; RV32I-NEXT:    sub a6, a5, a6
-; RV32I-NEXT:    sltu a5, a6, t5
-; RV32I-NEXT:    sub a5, a7, a5
-; RV32I-NEXT:    sub a6, a6, t5
+; RV32I-NEXT:    sub a5, a5, a6
 ; RV32I-NEXT:    sub a4, a4, a1
-; RV32I-NEXT:    sub a1, a4, t4
+; RV32I-NEXT:    sub a6, a7, t1
+; RV32I-NEXT:    sltu a7, a5, t5
+; RV32I-NEXT:    sub a1, a5, t5
+; RV32I-NEXT:    sub a5, a4, t4
+; RV32I-NEXT:    sub a4, a6, a7
 ; RV32I-NEXT:    sub a2, a3, a2
 ; RV32I-NEXT:    j .LBB22_11
 ; RV32I-NEXT:  .LBB22_10:
 ; RV32I-NEXT:    sub a7, t0, a7
-; RV32I-NEXT:    sub a6, a6, a5
-; RV32I-NEXT:    sub a5, a7, t1
-; RV32I-NEXT:    sltu a7, a6, t3
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    sub a5, a5, a7
-; RV32I-NEXT:    sub a6, a6, t3
-; RV32I-NEXT:    sub a1, a1, t2
+; RV32I-NEXT:    sub a5, a6, a5
+; RV32I-NEXT:    sub a4, a1, a4
+; RV32I-NEXT:    sub a6, a7, t1
+; RV32I-NEXT:    sltu a7, a5, t3
+; RV32I-NEXT:    sub a1, a5, t3
+; RV32I-NEXT:    sub a5, a4, t2
+; RV32I-NEXT:    sub a4, a6, a7
 ; RV32I-NEXT:    sub a2, a2, a3
 ; RV32I-NEXT:  .LBB22_11:
 ; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a1, 4(a0)
-; RV32I-NEXT:    sw a6, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    sw a5, 4(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a4, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_cmp_i128:
@@ -1922,30 +1922,30 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  # %bb.9:
 ; RV32ZBB-NEXT:    sltu t1, a5, a6
 ; RV32ZBB-NEXT:    sub a7, a7, t0
-; RV32ZBB-NEXT:    sub a7, a7, t1
-; RV32ZBB-NEXT:    sub a6, a5, a6
-; RV32ZBB-NEXT:    sltu a5, a6, t5
-; RV32ZBB-NEXT:    sub a5, a7, a5
-; RV32ZBB-NEXT:    sub a6, a6, t5
+; RV32ZBB-NEXT:    sub a5, a5, a6
 ; RV32ZBB-NEXT:    sub a4, a4, a1
-; RV32ZBB-NEXT:    sub a1, a4, t4
+; RV32ZBB-NEXT:    sub a6, a7, t1
+; RV32ZBB-NEXT:    sltu a7, a5, t5
+; RV32ZBB-NEXT:    sub a1, a5, t5
+; RV32ZBB-NEXT:    sub a5, a4, t4
+; RV32ZBB-NEXT:    sub a4, a6, a7
 ; RV32ZBB-NEXT:    sub a2, a3, a2
 ; RV32ZBB-NEXT:    j .LBB22_11
 ; RV32ZBB-NEXT:  .LBB22_10:
 ; RV32ZBB-NEXT:    sub a7, t0, a7
-; RV32ZBB-NEXT:    sub a6, a6, a5
-; RV32ZBB-NEXT:    sub a5, a7, t1
-; RV32ZBB-NEXT:    sltu a7, a6, t3
-; RV32ZBB-NEXT:    sub a1, a1, a4
-; RV32ZBB-NEXT:    sub a5, a5, a7
-; RV32ZBB-NEXT:    sub a6, a6, t3
-; RV32ZBB-NEXT:    sub a1, a1, t2
+; RV32ZBB-NEXT:    sub a5, a6, a5
+; RV32ZBB-NEXT:    sub a4, a1, a4
+; RV32ZBB-NEXT:    sub a6, a7, t1
+; RV32ZBB-NEXT:    sltu a7, a5, t3
+; RV32ZBB-NEXT:    sub a1, a5, t3
+; RV32ZBB-NEXT:    sub a5, a4, t2
+; RV32ZBB-NEXT:    sub a4, a6, a7
 ; RV32ZBB-NEXT:    sub a2, a2, a3
 ; RV32ZBB-NEXT:  .LBB22_11:
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
-; RV32ZBB-NEXT:    sw a1, 4(a0)
-; RV32ZBB-NEXT:    sw a6, 8(a0)
-; RV32ZBB-NEXT:    sw a5, 12(a0)
+; RV32ZBB-NEXT:    sw a5, 4(a0)
+; RV32ZBB-NEXT:    sw a1, 8(a0)
+; RV32ZBB-NEXT:    sw a4, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_cmp_i128:

diff  --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll
index 814bca98523ce8..7c8638cb461e26 100644
--- a/llvm/test/CodeGen/RISCV/abdu.ll
+++ b/llvm/test/CodeGen/RISCV/abdu.ll
@@ -178,8 +178,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i16_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -198,8 +198,8 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i16_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    zext.h a0, a0
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
 ; RV64ZBB-NEXT:    sub a0, a0, a2
@@ -267,8 +267,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -286,8 +286,8 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -317,8 +317,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -337,8 +337,8 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32_i16:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a0, a0, 32
-; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    zext.h a1, a1
+; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
 ; RV64ZBB-NEXT:    sub a0, a0, a2
@@ -365,8 +365,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_ext_i32_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -384,8 +384,8 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_ext_i32_undef:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -415,10 +415,10 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB9_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -447,10 +447,10 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB9_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -484,10 +484,10 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB10_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -516,10 +516,10 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB10_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -587,29 +587,29 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB11_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB11_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB11_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_ext_i128:
@@ -627,10 +627,10 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB11_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -683,29 +683,29 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB11_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB11_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB11_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_ext_i128:
@@ -723,10 +723,10 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB11_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %aext = zext i128 %a to i256
@@ -787,29 +787,29 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB12_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB12_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB12_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_ext_i128_undef:
@@ -827,10 +827,10 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB12_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -883,29 +883,29 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB12_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB12_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB12_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_ext_i128_undef:
@@ -923,10 +923,10 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB12_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %aext = zext i128 %a to i256
@@ -1029,8 +1029,8 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_minmax_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1048,8 +1048,8 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_minmax_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -1077,10 +1077,10 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB16_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -1109,10 +1109,10 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB16_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -1178,29 +1178,29 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB17_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB17_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB17_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_minmax_i128:
@@ -1218,10 +1218,10 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB17_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -1274,29 +1274,29 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB17_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB17_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB17_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_minmax_i128:
@@ -1314,10 +1314,10 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB17_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %min = call i128 @llvm.umin.i128(i128 %a, i128 %b)
@@ -1420,8 +1420,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_cmp_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1439,8 +1439,8 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_cmp_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -1469,10 +1469,10 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB21_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -1501,10 +1501,10 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB21_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -1571,29 +1571,29 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB22_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB22_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB22_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_cmp_i128:
@@ -1611,10 +1611,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB22_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -1667,29 +1667,29 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB22_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB22_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB22_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_cmp_i128:
@@ -1707,10 +1707,10 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB22_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %cmp = icmp uge i128 %a, %b
@@ -1814,8 +1814,8 @@ define i32 @abd_select_i32(i32 %a, i32 %b) nounwind {
 ; RV64I-LABEL: abd_select_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    sub a0, a0, a1
 ; RV64I-NEXT:    srai a1, a0, 63
@@ -1833,8 +1833,8 @@ define i32 @abd_select_i32(i32 %a, i32 %b) nounwind {
 ; RV64ZBB-LABEL: abd_select_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    slli a1, a1, 32
-; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    slli a0, a0, 32
+; RV64ZBB-NEXT:    srli a1, a1, 32
 ; RV64ZBB-NEXT:    srli a0, a0, 32
 ; RV64ZBB-NEXT:    minu a2, a0, a1
 ; RV64ZBB-NEXT:    maxu a0, a0, a1
@@ -1863,10 +1863,10 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:  .LBB26_3:
 ; RV32I-NEXT:    neg a1, a0
 ; RV32I-NEXT:    xor a2, a2, a1
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    xor a1, a3, a1
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    xor a3, a3, a1
+; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    add a3, a3, a0
+; RV32I-NEXT:    sub a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
@@ -1895,10 +1895,10 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB26_3:
 ; RV32ZBB-NEXT:    neg a1, a0
 ; RV32ZBB-NEXT:    xor a2, a2, a1
-; RV32ZBB-NEXT:    sltu a4, a2, a1
-; RV32ZBB-NEXT:    xor a1, a3, a1
-; RV32ZBB-NEXT:    add a1, a1, a0
-; RV32ZBB-NEXT:    sub a1, a1, a4
+; RV32ZBB-NEXT:    xor a3, a3, a1
+; RV32ZBB-NEXT:    sltu a1, a2, a1
+; RV32ZBB-NEXT:    add a3, a3, a0
+; RV32ZBB-NEXT:    sub a1, a3, a1
 ; RV32ZBB-NEXT:    add a0, a2, a0
 ; RV32ZBB-NEXT:    ret
 ;
@@ -1965,29 +1965,29 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:  .LBB27_10:
 ; RV32I-NEXT:    neg t0, a1
 ; RV32I-NEXT:    xor a2, a7, t0
-; RV32I-NEXT:    sltu a4, a2, t0
 ; RV32I-NEXT:    xor a6, a6, t0
-; RV32I-NEXT:    add a6, a6, a1
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    xor a3, a3, t0
-; RV32I-NEXT:    sltu a6, a3, t0
-; RV32I-NEXT:    xor a7, a5, t0
-; RV32I-NEXT:    mv t1, a6
+; RV32I-NEXT:    xor a4, a3, t0
+; RV32I-NEXT:    sltu a3, a2, t0
+; RV32I-NEXT:    add a7, a6, a1
+; RV32I-NEXT:    sltu a6, a4, t0
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    xor t1, a5, t0
+; RV32I-NEXT:    mv a7, a6
 ; RV32I-NEXT:    beqz a5, .LBB27_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    sltu t1, a7, t0
+; RV32I-NEXT:    sltu a7, t1, t0
 ; RV32I-NEXT:  .LBB27_12:
 ; RV32I-NEXT:    add a2, a2, a1
-; RV32I-NEXT:    sltu a5, a2, t1
-; RV32I-NEXT:    sub a4, a4, a5
-; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    add a7, a7, a1
-; RV32I-NEXT:    sub a5, a7, a6
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add t1, t1, a1
+; RV32I-NEXT:    add a1, a4, a1
+; RV32I-NEXT:    sltu a4, a2, a7
+; RV32I-NEXT:    sub a2, a2, a7
+; RV32I-NEXT:    sub a5, t1, a6
+; RV32I-NEXT:    sub a3, a3, a4
 ; RV32I-NEXT:    sw a1, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a4, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: abd_select_i128:
@@ -2005,10 +2005,10 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV64I-NEXT:  .LBB27_3:
 ; RV64I-NEXT:    neg a1, a0
 ; RV64I-NEXT:    xor a2, a2, a1
-; RV64I-NEXT:    sltu a4, a2, a1
-; RV64I-NEXT:    xor a1, a3, a1
-; RV64I-NEXT:    add a1, a1, a0
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    xor a3, a3, a1
+; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    add a3, a3, a0
+; RV64I-NEXT:    sub a1, a3, a1
 ; RV64I-NEXT:    add a0, a2, a0
 ; RV64I-NEXT:    ret
 ;
@@ -2061,29 +2061,29 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV32ZBB-NEXT:  .LBB27_10:
 ; RV32ZBB-NEXT:    neg t0, a1
 ; RV32ZBB-NEXT:    xor a2, a7, t0
-; RV32ZBB-NEXT:    sltu a4, a2, t0
 ; RV32ZBB-NEXT:    xor a6, a6, t0
-; RV32ZBB-NEXT:    add a6, a6, a1
-; RV32ZBB-NEXT:    sub a4, a6, a4
-; RV32ZBB-NEXT:    xor a3, a3, t0
-; RV32ZBB-NEXT:    sltu a6, a3, t0
-; RV32ZBB-NEXT:    xor a7, a5, t0
-; RV32ZBB-NEXT:    mv t1, a6
+; RV32ZBB-NEXT:    xor a4, a3, t0
+; RV32ZBB-NEXT:    sltu a3, a2, t0
+; RV32ZBB-NEXT:    add a7, a6, a1
+; RV32ZBB-NEXT:    sltu a6, a4, t0
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    xor t1, a5, t0
+; RV32ZBB-NEXT:    mv a7, a6
 ; RV32ZBB-NEXT:    beqz a5, .LBB27_12
 ; RV32ZBB-NEXT:  # %bb.11:
-; RV32ZBB-NEXT:    sltu t1, a7, t0
+; RV32ZBB-NEXT:    sltu a7, t1, t0
 ; RV32ZBB-NEXT:  .LBB27_12:
 ; RV32ZBB-NEXT:    add a2, a2, a1
-; RV32ZBB-NEXT:    sltu a5, a2, t1
-; RV32ZBB-NEXT:    sub a4, a4, a5
-; RV32ZBB-NEXT:    sub a2, a2, t1
-; RV32ZBB-NEXT:    add a7, a7, a1
-; RV32ZBB-NEXT:    sub a5, a7, a6
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add t1, t1, a1
+; RV32ZBB-NEXT:    add a1, a4, a1
+; RV32ZBB-NEXT:    sltu a4, a2, a7
+; RV32ZBB-NEXT:    sub a2, a2, a7
+; RV32ZBB-NEXT:    sub a5, t1, a6
+; RV32ZBB-NEXT:    sub a3, a3, a4
 ; RV32ZBB-NEXT:    sw a1, 0(a0)
 ; RV32ZBB-NEXT:    sw a5, 4(a0)
 ; RV32ZBB-NEXT:    sw a2, 8(a0)
-; RV32ZBB-NEXT:    sw a4, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: abd_select_i128:
@@ -2101,10 +2101,10 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; RV64ZBB-NEXT:  .LBB27_3:
 ; RV64ZBB-NEXT:    neg a1, a0
 ; RV64ZBB-NEXT:    xor a2, a2, a1
-; RV64ZBB-NEXT:    sltu a4, a2, a1
-; RV64ZBB-NEXT:    xor a1, a3, a1
-; RV64ZBB-NEXT:    add a1, a1, a0
-; RV64ZBB-NEXT:    sub a1, a1, a4
+; RV64ZBB-NEXT:    xor a3, a3, a1
+; RV64ZBB-NEXT:    sltu a1, a2, a1
+; RV64ZBB-NEXT:    add a3, a3, a0
+; RV64ZBB-NEXT:    sub a1, a3, a1
 ; RV64ZBB-NEXT:    add a0, a2, a0
 ; RV64ZBB-NEXT:    ret
   %cmp = icmp ult i128 %a, %b

diff  --git a/llvm/test/CodeGen/RISCV/add-before-shl.ll b/llvm/test/CodeGen/RISCV/add-before-shl.ll
index db7498340d3951..5d4478f9d4b5f0 100644
--- a/llvm/test/CodeGen/RISCV/add-before-shl.ll
+++ b/llvm/test/CodeGen/RISCV/add-before-shl.ll
@@ -171,16 +171,16 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ; RV32I-NEXT:    lw a1, 12(a1)
 ; RV32I-NEXT:    srli a5, a2, 29
 ; RV32I-NEXT:    slli a6, a3, 3
-; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    srli a3, a3, 29
+; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    slli a6, a4, 3
 ; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    lui a6, 128
 ; RV32I-NEXT:    srli a4, a4, 29
 ; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a2, a2, 3
-; RV32I-NEXT:    lui a4, 128
-; RV32I-NEXT:    add a1, a1, a4
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    add a1, a1, a6
 ; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a5, 4(a0)
 ; RV32I-NEXT:    sw a3, 8(a0)
@@ -191,8 +191,8 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 61
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 3
+; RV64I-NEXT:    or a1, a1, a2
 ; RV64I-NEXT:    addi a2, zero, 1
 ; RV64I-NEXT:    slli a2, a2, 51
 ; RV64I-NEXT:    add a1, a1, a2
@@ -200,23 +200,23 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ;
 ; RV32C-LABEL: add_wide_operand:
 ; RV32C:       # %bb.0:
-; RV32C-NEXT:    c.lw a2, 12(a1)
-; RV32C-NEXT:    lw a6, 0(a1)
-; RV32C-NEXT:    c.lw a3, 4(a1)
+; RV32C-NEXT:    c.lw a4, 12(a1)
+; RV32C-NEXT:    c.lw a3, 0(a1)
+; RV32C-NEXT:    c.lw a2, 4(a1)
 ; RV32C-NEXT:    c.lw a1, 8(a1)
 ; RV32C-NEXT:    c.lui a5, 16
-; RV32C-NEXT:    c.add a2, a5
-; RV32C-NEXT:    c.slli a2, 3
-; RV32C-NEXT:    srli a5, a1, 29
-; RV32C-NEXT:    c.or a2, a5
-; RV32C-NEXT:    srli a5, a6, 29
-; RV32C-NEXT:    slli a4, a3, 3
+; RV32C-NEXT:    add a6, a4, a5
+; RV32C-NEXT:    srli a5, a3, 29
+; RV32C-NEXT:    slli a4, a2, 3
 ; RV32C-NEXT:    c.or a4, a5
-; RV32C-NEXT:    c.srli a3, 29
+; RV32C-NEXT:    srli a5, a1, 29
+; RV32C-NEXT:    c.srli a2, 29
 ; RV32C-NEXT:    c.slli a1, 3
-; RV32C-NEXT:    c.or a1, a3
+; RV32C-NEXT:    c.slli a3, 3
 ; RV32C-NEXT:    c.slli a6, 3
-; RV32C-NEXT:    sw a6, 0(a0)
+; RV32C-NEXT:    c.or a1, a2
+; RV32C-NEXT:    or a2, a6, a5
+; RV32C-NEXT:    c.sw a3, 0(a0)
 ; RV32C-NEXT:    c.sw a4, 4(a0)
 ; RV32C-NEXT:    c.sw a1, 8(a0)
 ; RV32C-NEXT:    c.sw a2, 12(a0)
@@ -226,8 +226,8 @@ define i128 @add_wide_operand(i128 %a) nounwind {
 ; RV64C:       # %bb.0:
 ; RV64C-NEXT:    srli a2, a0, 61
 ; RV64C-NEXT:    c.slli a1, 3
-; RV64C-NEXT:    c.or a1, a2
 ; RV64C-NEXT:    c.slli a0, 3
+; RV64C-NEXT:    c.or a1, a2
 ; RV64C-NEXT:    c.li a2, 1
 ; RV64C-NEXT:    c.slli a2, 51
 ; RV64C-NEXT:    c.add a1, a2

diff  --git a/llvm/test/CodeGen/RISCV/add-imm.ll b/llvm/test/CodeGen/RISCV/add-imm.ll
index 52751f1c224211..84deb4c00ac8d1 100644
--- a/llvm/test/CodeGen/RISCV/add-imm.ll
+++ b/llvm/test/CodeGen/RISCV/add-imm.ll
@@ -213,29 +213,29 @@ define void @add32_reject() nounwind {
 ; RV32I-LABEL: add32_reject:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, %hi(ga)
-; RV32I-NEXT:    lw a1, %lo(ga)(a0)
-; RV32I-NEXT:    lui a2, %hi(gb)
-; RV32I-NEXT:    lw a3, %lo(gb)(a2)
+; RV32I-NEXT:    lui a1, %hi(gb)
+; RV32I-NEXT:    lw a2, %lo(ga)(a0)
+; RV32I-NEXT:    lw a3, %lo(gb)(a1)
 ; RV32I-NEXT:    lui a4, 1
 ; RV32I-NEXT:    addi a4, a4, -1096
-; RV32I-NEXT:    add a1, a1, a4
+; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    add a3, a3, a4
-; RV32I-NEXT:    sw a1, %lo(ga)(a0)
-; RV32I-NEXT:    sw a3, %lo(gb)(a2)
+; RV32I-NEXT:    sw a2, %lo(ga)(a0)
+; RV32I-NEXT:    sw a3, %lo(gb)(a1)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: add32_reject:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a0, %hi(ga)
-; RV64I-NEXT:    lw a1, %lo(ga)(a0)
-; RV64I-NEXT:    lui a2, %hi(gb)
-; RV64I-NEXT:    lw a3, %lo(gb)(a2)
+; RV64I-NEXT:    lui a1, %hi(gb)
+; RV64I-NEXT:    lw a2, %lo(ga)(a0)
+; RV64I-NEXT:    lw a3, %lo(gb)(a1)
 ; RV64I-NEXT:    lui a4, 1
 ; RV64I-NEXT:    addi a4, a4, -1096
-; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    add a2, a2, a4
 ; RV64I-NEXT:    add a3, a3, a4
-; RV64I-NEXT:    sw a1, %lo(ga)(a0)
-; RV64I-NEXT:    sw a3, %lo(gb)(a2)
+; RV64I-NEXT:    sw a2, %lo(ga)(a0)
+; RV64I-NEXT:    sw a3, %lo(gb)(a1)
 ; RV64I-NEXT:    ret
   %1 = load i32, ptr @ga, align 4
   %2 = load i32, ptr @gb, align 4

diff  --git a/llvm/test/CodeGen/RISCV/addcarry.ll b/llvm/test/CodeGen/RISCV/addcarry.ll
index 3a4163a8bb50f9..ff0d1e75c746c5 100644
--- a/llvm/test/CodeGen/RISCV/addcarry.ll
+++ b/llvm/test/CodeGen/RISCV/addcarry.ll
@@ -12,16 +12,16 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
 ; RISCV32:       # %bb.0:
 ; RISCV32-NEXT:    mul a4, a0, a3
 ; RISCV32-NEXT:    mulhu a5, a0, a2
-; RISCV32-NEXT:    add a6, a5, a4
-; RISCV32-NEXT:    mul a4, a1, a2
-; RISCV32-NEXT:    add a4, a6, a4
-; RISCV32-NEXT:    sltu a7, a4, a6
-; RISCV32-NEXT:    sltu a5, a6, a5
-; RISCV32-NEXT:    mulhu a6, a0, a3
+; RISCV32-NEXT:    mul a6, a1, a2
+; RISCV32-NEXT:    mulhu a7, a0, a3
 ; RISCV32-NEXT:    mulhu t0, a1, a2
-; RISCV32-NEXT:    add a6, a6, t0
-; RISCV32-NEXT:    add a5, a6, a5
-; RISCV32-NEXT:    add a5, a5, a7
+; RISCV32-NEXT:    add t1, a5, a4
+; RISCV32-NEXT:    add a7, a7, t0
+; RISCV32-NEXT:    add a4, t1, a6
+; RISCV32-NEXT:    sltu a5, t1, a5
+; RISCV32-NEXT:    sltu a6, a4, t1
+; RISCV32-NEXT:    add a5, a7, a5
+; RISCV32-NEXT:    add a5, a5, a6
 ; RISCV32-NEXT:    mul a6, a1, a3
 ; RISCV32-NEXT:    add a5, a5, a6
 ; RISCV32-NEXT:    bgez a1, .LBB0_2
@@ -34,9 +34,9 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind {
 ; RISCV32-NEXT:  .LBB0_4:
 ; RISCV32-NEXT:    slli a5, a5, 30
 ; RISCV32-NEXT:    srli a1, a4, 2
-; RISCV32-NEXT:    or a1, a5, a1
 ; RISCV32-NEXT:    slli a4, a4, 30
 ; RISCV32-NEXT:    mul a0, a0, a2
+; RISCV32-NEXT:    or a1, a5, a1
 ; RISCV32-NEXT:    srli a0, a0, 2
 ; RISCV32-NEXT:    or a0, a4, a0
 ; RISCV32-NEXT:    ret
@@ -49,8 +49,8 @@ define { i32, i32, i1 } @addcarry_2x32(i32 %x0, i32 %x1, i32 %y0, i32 %y1) nounw
 ; RISCV32-LABEL: addcarry_2x32:
 ; RISCV32:       # %bb.0:
 ; RISCV32-NEXT:    add a3, a1, a3
-; RISCV32-NEXT:    sltu a1, a3, a1
 ; RISCV32-NEXT:    add a4, a2, a4
+; RISCV32-NEXT:    sltu a1, a3, a1
 ; RISCV32-NEXT:    sltu a2, a4, a2
 ; RISCV32-NEXT:    add a1, a4, a1
 ; RISCV32-NEXT:    sltu a4, a1, a4

diff  --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
index a18526718461ef..8e445511b61195 100644
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -53,16 +53,16 @@ define i64 @add_mul_combine_accept_a3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_accept_a3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 29
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a2, a0, a0
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
-; RV32IMB-NEXT:    addi a0, a2, 1073
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    addi a0, a3, 1073
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -132,18 +132,18 @@ define i64 @add_mul_combine_accept_b3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_accept_b3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 23
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh3add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh3add a2, a0, a0
+; RV32IMB-NEXT:    sh3add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
 ; RV32IMB-NEXT:    lui a0, 50
 ; RV32IMB-NEXT:    addi a0, a0, 1119
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -207,18 +207,18 @@ define i64 @add_mul_combine_reject_a3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_a3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 29
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a2, a0, a0
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -185
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -278,16 +278,16 @@ define i64 @add_mul_combine_reject_c3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_c3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 73
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh3add a3, a1, a1
 ; RV32IMB-NEXT:    sh3add a1, a3, a1
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh3add a2, a0, a0
-; RV32IMB-NEXT:    sh3add a2, a2, a0
+; RV32IMB-NEXT:    sh3add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
+; RV32IMB-NEXT:    sh3add a3, a3, a0
 ; RV32IMB-NEXT:    lui a0, 18
 ; RV32IMB-NEXT:    addi a0, a0, -728
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -344,16 +344,16 @@ define i64 @add_mul_combine_reject_d3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_d3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 192
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a1, a1, a1
+; RV32IMB-NEXT:    mulhu a2, a0, a2
+; RV32IMB-NEXT:    sh1add a0, a0, a0
 ; RV32IMB-NEXT:    slli a1, a1, 6
 ; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a0, a0, a0
-; RV32IMB-NEXT:    slli a2, a0, 6
-; RV32IMB-NEXT:    lui a0, 47
-; RV32IMB-NEXT:    addi a0, a0, -512
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    lui a2, 47
+; RV32IMB-NEXT:    slli a3, a0, 6
+; RV32IMB-NEXT:    addi a0, a2, -512
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -414,18 +414,18 @@ define i64 @add_mul_combine_reject_e3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_e3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 29
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a2, a0, a0
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -185
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -491,18 +491,18 @@ define i64 @add_mul_combine_reject_f3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_f3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 29
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a3, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 5
 ; RV32IMB-NEXT:    sub a1, a1, a3
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh1add a2, a0, a0
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    slli a0, a0, 5
-; RV32IMB-NEXT:    sub a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a0, a3
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -145
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -565,16 +565,16 @@ define i64 @add_mul_combine_reject_g3(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_reject_g3:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 73
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh3add a3, a1, a1
 ; RV32IMB-NEXT:    sh3add a1, a3, a1
-; RV32IMB-NEXT:    add a1, a2, a1
-; RV32IMB-NEXT:    sh3add a2, a0, a0
-; RV32IMB-NEXT:    sh3add a2, a2, a0
+; RV32IMB-NEXT:    sh3add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a2, a0, a2
+; RV32IMB-NEXT:    sh3add a3, a3, a0
 ; RV32IMB-NEXT:    lui a0, 2
 ; RV32IMB-NEXT:    addi a0, a0, -882
-; RV32IMB-NEXT:    add a0, a2, a0
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    add a1, a2, a1
+; RV32IMB-NEXT:    add a0, a3, a0
+; RV32IMB-NEXT:    sltu a2, a0, a3
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -595,15 +595,15 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) {
 ; RV32IMB-LABEL: add_mul_combine_infinite_loop:
 ; RV32IMB:       # %bb.0:
 ; RV32IMB-NEXT:    li a2, 24
-; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a1, a1, a1
-; RV32IMB-NEXT:    sh3add a1, a1, a2
-; RV32IMB-NEXT:    sh1add a0, a0, a0
-; RV32IMB-NEXT:    slli a2, a0, 3
-; RV32IMB-NEXT:    li a3, 1
-; RV32IMB-NEXT:    slli a3, a3, 11
-; RV32IMB-NEXT:    sh3add a0, a0, a3
-; RV32IMB-NEXT:    sltu a2, a0, a2
+; RV32IMB-NEXT:    sh1add a3, a0, a0
+; RV32IMB-NEXT:    mulhu a0, a0, a2
+; RV32IMB-NEXT:    li a2, 1
+; RV32IMB-NEXT:    sh3add a1, a1, a0
+; RV32IMB-NEXT:    slli a4, a3, 3
+; RV32IMB-NEXT:    slli a2, a2, 11
+; RV32IMB-NEXT:    sh3add a0, a3, a2
+; RV32IMB-NEXT:    sltu a2, a0, a4
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
 ;
@@ -672,10 +672,10 @@ define i64 @mul3000_add8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, -1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 2
 ; RV32IMB-NEXT:    addi a0, a0, 798
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
@@ -747,10 +747,10 @@ define i64 @mul3000_sub8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, -1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 1048574
 ; RV32IMB-NEXT:    addi a0, a0, -798
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
@@ -823,12 +823,12 @@ define i64 @mulneg3000_add8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, 1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    sub a3, a3, a0
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a3, a0
 ; RV32IMB-NEXT:    lui a0, 2
 ; RV32IMB-NEXT:    addi a0, a0, 798
 ; RV32IMB-NEXT:    add a0, a2, a0
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    ret
@@ -899,12 +899,12 @@ define i64 @mulneg3000_sub8990_c(i64 %x) {
 ; RV32IMB-NEXT:    addi a2, a2, 1096
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
-; RV32IMB-NEXT:    sub a3, a3, a0
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
+; RV32IMB-NEXT:    sub a3, a3, a0
 ; RV32IMB-NEXT:    lui a0, 1048574
 ; RV32IMB-NEXT:    addi a0, a0, -798
 ; RV32IMB-NEXT:    add a0, a2, a0
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
 ; RV32IMB-NEXT:    addi a1, a1, -1

diff  --git a/llvm/test/CodeGen/RISCV/alu16.ll b/llvm/test/CodeGen/RISCV/alu16.ll
index cb28ccdda0a54b..41f26526ef03ec 100644
--- a/llvm/test/CodeGen/RISCV/alu16.ll
+++ b/llvm/test/CodeGen/RISCV/alu16.ll
@@ -254,8 +254,8 @@ define i16 @slt(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: slt:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slt a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -263,8 +263,8 @@ define i16 @slt(i16 %a, i16 %b) nounwind {
 ; RV64I-LABEL: slt:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slt a0, a0, a1
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/alu8.ll b/llvm/test/CodeGen/RISCV/alu8.ll
index d563525be7a356..6ae96e7c9deae4 100644
--- a/llvm/test/CodeGen/RISCV/alu8.ll
+++ b/llvm/test/CodeGen/RISCV/alu8.ll
@@ -252,8 +252,8 @@ define i8 @slt(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: slt:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slt a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -261,8 +261,8 @@ define i8 @slt(i8 %a, i8 %b) nounwind {
 ; RV64I-LABEL: slt:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slt a0, a0, a1
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/and.ll b/llvm/test/CodeGen/RISCV/and.ll
index 79e3b954c50d8d..31c63c7f9b18f5 100644
--- a/llvm/test/CodeGen/RISCV/and.ll
+++ b/llvm/test/CodeGen/RISCV/and.ll
@@ -124,8 +124,8 @@ define i64 @and64_0x7ffffffffffff000(i64 %x) {
 ; RV32I-LABEL: and64_0x7ffffffffffff000:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 1048575
-; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
index 234a956be809ed..741860db13957a 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg-branch-on-result.ll
@@ -99,10 +99,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
 ; RV32IA-NEXT:    li a0, 255
-; RV32IA-NEXT:    sll a0, a0, a4
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a4
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a0, a0, a4
+; RV32IA-NEXT:    sll a1, a1, a4
 ; RV32IA-NEXT:    sll a2, a2, a4
 ; RV32IA-NEXT:  .LBB2_1: # %do_cmpxchg
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -129,10 +129,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV32IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-ZACAS-NEXT:    slli a4, a0, 3
 ; RV32IA-ZACAS-NEXT:    li a0, 255
-; RV32IA-ZACAS-NEXT:    sll a0, a0, a4
 ; RV32IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-ZACAS-NEXT:    sll a1, a1, a4
 ; RV32IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-ZACAS-NEXT:    sll a0, a0, a4
+; RV32IA-ZACAS-NEXT:    sll a1, a1, a4
 ; RV32IA-ZACAS-NEXT:    sll a2, a2, a4
 ; RV32IA-ZACAS-NEXT:  .LBB2_1: # %do_cmpxchg
 ; RV32IA-ZACAS-NEXT:    # =>This Loop Header: Depth=1
@@ -159,10 +159,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV64IA-NEXT:    andi a3, a0, -4
 ; RV64IA-NEXT:    slli a4, a0, 3
 ; RV64IA-NEXT:    li a0, 255
-; RV64IA-NEXT:    sllw a0, a0, a4
 ; RV64IA-NEXT:    andi a1, a1, 255
-; RV64IA-NEXT:    sllw a1, a1, a4
 ; RV64IA-NEXT:    andi a2, a2, 255
+; RV64IA-NEXT:    sllw a0, a0, a4
+; RV64IA-NEXT:    sllw a1, a1, a4
 ; RV64IA-NEXT:    sllw a2, a2, a4
 ; RV64IA-NEXT:  .LBB2_1: # %do_cmpxchg
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
@@ -189,10 +189,10 @@ define void @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a4, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a0, 255
-; RV64IA-ZACAS-NEXT:    sllw a0, a0, a4
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a4
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a0, a0, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a4
 ; RV64IA-ZACAS-NEXT:    sllw a2, a2, a4
 ; RV64IA-ZACAS-NEXT:  .LBB2_1: # %do_cmpxchg
 ; RV64IA-ZACAS-NEXT:    # =>This Loop Header: Depth=1
@@ -240,10 +240,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
 ; RV32IA-NEXT:    li a0, 255
-; RV32IA-NEXT:    sll a0, a0, a4
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a4
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a0, a0, a4
+; RV32IA-NEXT:    sll a1, a1, a4
 ; RV32IA-NEXT:    sll a2, a2, a4
 ; RV32IA-NEXT:  .LBB3_1: # %do_cmpxchg
 ; RV32IA-NEXT:    # =>This Loop Header: Depth=1
@@ -273,10 +273,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV32IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-ZACAS-NEXT:    slli a4, a0, 3
 ; RV32IA-ZACAS-NEXT:    li a0, 255
-; RV32IA-ZACAS-NEXT:    sll a0, a0, a4
 ; RV32IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-ZACAS-NEXT:    sll a1, a1, a4
 ; RV32IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-ZACAS-NEXT:    sll a0, a0, a4
+; RV32IA-ZACAS-NEXT:    sll a1, a1, a4
 ; RV32IA-ZACAS-NEXT:    sll a2, a2, a4
 ; RV32IA-ZACAS-NEXT:  .LBB3_1: # %do_cmpxchg
 ; RV32IA-ZACAS-NEXT:    # =>This Loop Header: Depth=1
@@ -306,10 +306,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV64IA-NEXT:    andi a3, a0, -4
 ; RV64IA-NEXT:    slli a4, a0, 3
 ; RV64IA-NEXT:    li a0, 255
-; RV64IA-NEXT:    sllw a0, a0, a4
 ; RV64IA-NEXT:    andi a1, a1, 255
-; RV64IA-NEXT:    sllw a1, a1, a4
 ; RV64IA-NEXT:    andi a2, a2, 255
+; RV64IA-NEXT:    sllw a0, a0, a4
+; RV64IA-NEXT:    sllw a1, a1, a4
 ; RV64IA-NEXT:    sllw a2, a2, a4
 ; RV64IA-NEXT:  .LBB3_1: # %do_cmpxchg
 ; RV64IA-NEXT:    # =>This Loop Header: Depth=1
@@ -339,10 +339,10 @@ define void @cmpxchg_masked_and_branch2(ptr %ptr, i8 signext %cmp, i8 signext %v
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a4, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a0, 255
-; RV64IA-ZACAS-NEXT:    sllw a0, a0, a4
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a4
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a0, a0, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a4
 ; RV64IA-ZACAS-NEXT:    sllw a2, a2, a4
 ; RV64IA-ZACAS-NEXT:  .LBB3_1: # %do_cmpxchg
 ; RV64IA-ZACAS-NEXT:    # =>This Loop Header: Depth=1

diff  --git a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
index a87b49e61a8dbc..c3b972840377f8 100644
--- a/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-cmpxchg.ll
@@ -43,10 +43,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a2, (a3)
@@ -79,10 +79,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w a2, (a3)
@@ -102,10 +102,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a2, (a3)
@@ -130,10 +130,10 @@ define void @cmpxchg_i8_monotonic_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -170,10 +170,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -193,10 +193,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -216,10 +216,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -239,10 +239,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -275,10 +275,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -298,10 +298,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -326,10 +326,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -349,10 +349,10 @@ define void @cmpxchg_i8_acquire_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -394,10 +394,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -417,10 +417,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -440,10 +440,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -463,10 +463,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -499,10 +499,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -522,10 +522,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -550,10 +550,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -573,10 +573,10 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -618,10 +618,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a2, (a3)
@@ -641,10 +641,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -664,10 +664,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -687,10 +687,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -723,10 +723,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w a2, (a3)
@@ -746,10 +746,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -774,10 +774,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -797,10 +797,10 @@ define void @cmpxchg_i8_release_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -842,10 +842,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -865,10 +865,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -888,10 +888,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -911,10 +911,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -947,10 +947,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -970,10 +970,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -998,10 +998,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1021,10 +1021,10 @@ define void @cmpxchg_i8_release_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1066,10 +1066,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -1089,10 +1089,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -1112,10 +1112,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1135,10 +1135,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1171,10 +1171,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -1194,10 +1194,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -1222,10 +1222,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1245,10 +1245,10 @@ define void @cmpxchg_i8_acq_rel_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1290,10 +1290,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -1313,10 +1313,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-WMO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -1336,10 +1336,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1359,10 +1359,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV32IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV32IA-TSO-ZACAS-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1395,10 +1395,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -1418,10 +1418,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -1446,10 +1446,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -1469,10 +1469,10 @@ define void @cmpxchg_i8_acq_rel_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -1514,10 +1514,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -1550,10 +1550,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1573,10 +1573,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -1601,10 +1601,10 @@ define void @cmpxchg_i8_seq_cst_monotonic(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1646,10 +1646,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -1682,10 +1682,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1705,10 +1705,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -1733,10 +1733,10 @@ define void @cmpxchg_i8_seq_cst_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB8_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1778,10 +1778,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -1814,10 +1814,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-WMO-NEXT:    andi a3, a0, -4
 ; RV64IA-WMO-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NEXT:    li a4, 255
-; RV64IA-WMO-NEXT:    sllw a4, a4, a0
 ; RV64IA-WMO-NEXT:    andi a1, a1, 255
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    andi a2, a2, 255
+; RV64IA-WMO-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1837,10 +1837,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a3, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    andi a2, a2, 255
+; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -1866,10 +1866,10 @@ define void @cmpxchg_i8_seq_cst_seq_cst(ptr %ptr, i8 %cmp, i8 %val) nounwind {
 ; RV64IA-TSO-NEXT:    andi a3, a0, -4
 ; RV64IA-TSO-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NEXT:    li a4, 255
-; RV64IA-TSO-NEXT:    sllw a4, a4, a0
 ; RV64IA-TSO-NEXT:    andi a1, a1, 255
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    andi a2, a2, 255
+; RV64IA-TSO-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB9_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -1915,8 +1915,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a2, (a3)
@@ -1952,8 +1952,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w a2, (a3)
@@ -1976,8 +1976,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2005,8 +2005,8 @@ define void @cmpxchg_i16_monotonic_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounw
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2046,8 +2046,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2070,8 +2070,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2094,8 +2094,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2118,8 +2118,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2155,8 +2155,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2179,8 +2179,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2208,8 +2208,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2232,8 +2232,8 @@ define void @cmpxchg_i16_acquire_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2278,8 +2278,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2302,8 +2302,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2326,8 +2326,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2350,8 +2350,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2387,8 +2387,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2411,8 +2411,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2440,8 +2440,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2464,8 +2464,8 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2510,8 +2510,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a2, (a3)
@@ -2534,8 +2534,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2558,8 +2558,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2582,8 +2582,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2619,8 +2619,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w a2, (a3)
@@ -2643,8 +2643,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2672,8 +2672,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2696,8 +2696,8 @@ define void @cmpxchg_i16_release_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2742,8 +2742,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2766,8 +2766,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2790,8 +2790,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2814,8 +2814,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2851,8 +2851,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2875,8 +2875,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -2904,8 +2904,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -2928,8 +2928,8 @@ define void @cmpxchg_i16_release_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB14_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -2974,8 +2974,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -2998,8 +2998,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -3022,8 +3022,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -3046,8 +3046,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -3083,8 +3083,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -3107,8 +3107,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -3136,8 +3136,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -3160,8 +3160,8 @@ define void @cmpxchg_i16_acq_rel_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -3206,8 +3206,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-NEXT:    and a1, a1, a4
-; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    and a2, a2, a4
+; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -3230,8 +3230,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-WMO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-WMO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-WMO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -3254,8 +3254,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-NEXT:    and a1, a1, a4
-; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    and a2, a2, a4
+; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a2, (a3)
@@ -3278,8 +3278,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV32IA-TSO-ZACAS-NEXT:    sll a5, a4, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV32IA-TSO-ZACAS-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-ZACAS-NEXT:    sll a0, a2, a0
 ; RV32IA-TSO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -3315,8 +3315,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aq a2, (a3)
@@ -3339,8 +3339,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a2, (a3)
@@ -3368,8 +3368,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w a2, (a3)
@@ -3392,8 +3392,8 @@ define void @cmpxchg_i16_acq_rel_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a2, (a3)
@@ -3438,8 +3438,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -3475,8 +3475,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3499,8 +3499,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -3528,8 +3528,8 @@ define void @cmpxchg_i16_seq_cst_monotonic(ptr %ptr, i16 %cmp, i16 %val) nounwin
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3574,8 +3574,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -3611,8 +3611,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3635,8 +3635,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -3664,8 +3664,8 @@ define void @cmpxchg_i16_seq_cst_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3710,8 +3710,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a2, (a3)
@@ -3747,8 +3747,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-WMO-NEXT:    addi a4, a4, -1
 ; RV64IA-WMO-NEXT:    sllw a5, a4, a0
 ; RV64IA-WMO-NEXT:    and a1, a1, a4
-; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    and a2, a2, a4
+; RV64IA-WMO-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NEXT:    sllw a0, a2, a0
 ; RV64IA-WMO-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NEXT:    lr.w.aqrl a2, (a3)
@@ -3771,8 +3771,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
 ; RV64IA-ZACAS-NEXT:    sllw a5, a4, a0
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a4
-; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    and a2, a2, a4
+; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    sllw a0, a2, a0
 ; RV64IA-ZACAS-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a2, (a3)
@@ -3801,8 +3801,8 @@ define void @cmpxchg_i16_seq_cst_seq_cst(ptr %ptr, i16 %cmp, i16 %val) nounwind
 ; RV64IA-TSO-NEXT:    addi a4, a4, -1
 ; RV64IA-TSO-NEXT:    sllw a5, a4, a0
 ; RV64IA-TSO-NEXT:    and a1, a1, a4
-; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    and a2, a2, a4
+; RV64IA-TSO-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NEXT:    sllw a0, a2, a0
 ; RV64IA-TSO-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NEXT:    lr.w.aqrl a2, (a3)

diff  --git a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
index 35a1227b86b3a6..8534ad379ebab2 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw-discard.ll
@@ -191,10 +191,10 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    mv s0, a0
+; RV32-NEXT:    mv s0, a2
+; RV32-NEXT:    mv s1, a0
 ; RV32-NEXT:    lw a4, 0(a0)
 ; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB11_2
 ; RV32-NEXT:  .LBB11_1: # %atomicrmw.start
@@ -204,17 +204,17 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    mv a0, s0
+; RV32-NEXT:    mv a0, s1
 ; RV32-NEXT:    call __atomic_compare_exchange_8
 ; RV32-NEXT:    lw a4, 8(sp)
 ; RV32-NEXT:    lw a5, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB11_6
 ; RV32-NEXT:  .LBB11_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a5, s1, .LBB11_4
+; RV32-NEXT:    beq a5, s0, .LBB11_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB11_2 Depth=1
-; RV32-NEXT:    slt a0, s1, a5
+; RV32-NEXT:    slt a0, s0, a5
 ; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    bnez a0, .LBB11_1
@@ -227,7 +227,7 @@ define void @amomax_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:  .LBB11_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB11_2 Depth=1
 ; RV32-NEXT:    mv a2, s2
-; RV32-NEXT:    mv a3, s1
+; RV32-NEXT:    mv a3, s0
 ; RV32-NEXT:    j .LBB11_1
 ; RV32-NEXT:  .LBB11_6: # %atomicrmw.end
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -267,10 +267,10 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    mv s0, a0
+; RV32-NEXT:    mv s0, a2
+; RV32-NEXT:    mv s1, a0
 ; RV32-NEXT:    lw a4, 0(a0)
 ; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB13_2
 ; RV32-NEXT:  .LBB13_1: # %atomicrmw.start
@@ -280,17 +280,17 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    mv a0, s0
+; RV32-NEXT:    mv a0, s1
 ; RV32-NEXT:    call __atomic_compare_exchange_8
 ; RV32-NEXT:    lw a4, 8(sp)
 ; RV32-NEXT:    lw a5, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB13_6
 ; RV32-NEXT:  .LBB13_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a5, s1, .LBB13_4
+; RV32-NEXT:    beq a5, s0, .LBB13_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB13_2 Depth=1
-; RV32-NEXT:    sltu a0, s1, a5
+; RV32-NEXT:    sltu a0, s0, a5
 ; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    bnez a0, .LBB13_1
@@ -303,7 +303,7 @@ define void @amomaxu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:  .LBB13_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB13_2 Depth=1
 ; RV32-NEXT:    mv a2, s2
-; RV32-NEXT:    mv a3, s1
+; RV32-NEXT:    mv a3, s0
 ; RV32-NEXT:    j .LBB13_1
 ; RV32-NEXT:  .LBB13_6: # %atomicrmw.end
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -343,10 +343,10 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    mv s0, a0
+; RV32-NEXT:    mv s0, a2
+; RV32-NEXT:    mv s1, a0
 ; RV32-NEXT:    lw a4, 0(a0)
 ; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB15_2
 ; RV32-NEXT:  .LBB15_1: # %atomicrmw.start
@@ -356,17 +356,17 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    mv a0, s0
+; RV32-NEXT:    mv a0, s1
 ; RV32-NEXT:    call __atomic_compare_exchange_8
 ; RV32-NEXT:    lw a4, 8(sp)
 ; RV32-NEXT:    lw a5, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB15_6
 ; RV32-NEXT:  .LBB15_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a5, s1, .LBB15_4
+; RV32-NEXT:    beq a5, s0, .LBB15_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB15_2 Depth=1
-; RV32-NEXT:    slt a0, s1, a5
+; RV32-NEXT:    slt a0, s0, a5
 ; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    beqz a0, .LBB15_1
@@ -379,7 +379,7 @@ define void @amomin_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:  .LBB15_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB15_2 Depth=1
 ; RV32-NEXT:    mv a2, s2
-; RV32-NEXT:    mv a3, s1
+; RV32-NEXT:    mv a3, s0
 ; RV32-NEXT:    j .LBB15_1
 ; RV32-NEXT:  .LBB15_6: # %atomicrmw.end
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -419,10 +419,10 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    mv s0, a0
+; RV32-NEXT:    mv s0, a2
+; RV32-NEXT:    mv s1, a0
 ; RV32-NEXT:    lw a4, 0(a0)
 ; RV32-NEXT:    lw a5, 4(a0)
-; RV32-NEXT:    mv s1, a2
 ; RV32-NEXT:    mv s2, a1
 ; RV32-NEXT:    j .LBB17_2
 ; RV32-NEXT:  .LBB17_1: # %atomicrmw.start
@@ -432,17 +432,17 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    li a4, 5
 ; RV32-NEXT:    li a5, 5
-; RV32-NEXT:    mv a0, s0
+; RV32-NEXT:    mv a0, s1
 ; RV32-NEXT:    call __atomic_compare_exchange_8
 ; RV32-NEXT:    lw a4, 8(sp)
 ; RV32-NEXT:    lw a5, 12(sp)
 ; RV32-NEXT:    bnez a0, .LBB17_6
 ; RV32-NEXT:  .LBB17_2: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    beq a5, s1, .LBB17_4
+; RV32-NEXT:    beq a5, s0, .LBB17_4
 ; RV32-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB17_2 Depth=1
-; RV32-NEXT:    sltu a0, s1, a5
+; RV32-NEXT:    sltu a0, s0, a5
 ; RV32-NEXT:    mv a2, a4
 ; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:    beqz a0, .LBB17_1
@@ -455,7 +455,7 @@ define void @amominu_d_discard(ptr %a, i64 %b) nounwind {
 ; RV32-NEXT:  .LBB17_5: # %atomicrmw.start
 ; RV32-NEXT:    # in Loop: Header=BB17_2 Depth=1
 ; RV32-NEXT:    mv a2, s2
-; RV32-NEXT:    mv a3, s1
+; RV32-NEXT:    mv a3, s0
 ; RV32-NEXT:    j .LBB17_1
 ; RV32-NEXT:  .LBB17_6: # %atomicrmw.end
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/atomic-rmw.ll b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
index 469edacb391df6..81518541477a83 100644
--- a/llvm/test/CodeGen/RISCV/atomic-rmw.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw.ll
@@ -46,8 +46,8 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -76,8 +76,8 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -96,8 +96,8 @@ define i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -140,8 +140,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -160,8 +160,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -190,8 +190,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -210,8 +210,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -230,8 +230,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -250,8 +250,8 @@ define i8 @atomicrmw_xchg_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -294,8 +294,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -314,8 +314,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -344,8 +344,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -364,8 +364,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -384,8 +384,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -404,8 +404,8 @@ define i8 @atomicrmw_xchg_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -448,8 +448,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -468,8 +468,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -498,8 +498,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -518,8 +518,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -538,8 +538,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -558,8 +558,8 @@ define i8 @atomicrmw_xchg_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -602,8 +602,8 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -632,8 +632,8 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -652,8 +652,8 @@ define i8 @atomicrmw_xchg_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -1636,8 +1636,8 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -1666,8 +1666,8 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -1686,8 +1686,8 @@ define i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB15_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -1730,8 +1730,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -1750,8 +1750,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -1780,8 +1780,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -1800,8 +1800,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -1820,8 +1820,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -1840,8 +1840,8 @@ define i8 @atomicrmw_add_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB16_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -1884,8 +1884,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -1904,8 +1904,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -1934,8 +1934,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -1954,8 +1954,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -1974,8 +1974,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -1994,8 +1994,8 @@ define i8 @atomicrmw_add_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB17_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2038,8 +2038,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -2058,8 +2058,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -2088,8 +2088,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2108,8 +2108,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2128,8 +2128,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2148,8 +2148,8 @@ define i8 @atomicrmw_add_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB18_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2192,8 +2192,8 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -2222,8 +2222,8 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -2242,8 +2242,8 @@ define i8 @atomicrmw_add_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB19_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -2286,8 +2286,8 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB20_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -2316,8 +2316,8 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB20_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2336,8 +2336,8 @@ define i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB20_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2382,8 +2382,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -2402,8 +2402,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -2432,8 +2432,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2452,8 +2452,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2472,8 +2472,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2492,8 +2492,8 @@ define i8 @atomicrmw_sub_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2538,8 +2538,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -2558,8 +2558,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -2588,8 +2588,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2608,8 +2608,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2628,8 +2628,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2648,8 +2648,8 @@ define i8 @atomicrmw_sub_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2694,8 +2694,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -2714,8 +2714,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -2744,8 +2744,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2764,8 +2764,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -2784,8 +2784,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -2804,8 +2804,8 @@ define i8 @atomicrmw_sub_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB23_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -2850,8 +2850,8 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB24_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -2880,8 +2880,8 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB24_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -2900,8 +2900,8 @@ define i8 @atomicrmw_sub_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB24_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -2946,9 +2946,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
+; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    not a3, a3
-; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    or a1, a1, a3
 ; RV32IA-NEXT:    amoand.w a1, a1, (a2)
@@ -2970,9 +2970,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
+; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    not a3, a3
-; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -2984,9 +2984,9 @@ define i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
+; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    not a3, a3
-; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3022,9 +3022,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
+; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    not a3, a3
-; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aq a1, a1, (a2)
@@ -3036,9 +3036,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
+; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    not a3, a3
-; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
@@ -3060,9 +3060,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aq a1, a1, (a2)
@@ -3074,9 +3074,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3088,9 +3088,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aq a1, a1, (a2)
@@ -3102,9 +3102,9 @@ define i8 @atomicrmw_and_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3140,9 +3140,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
+; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    not a3, a3
-; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.rl a1, a1, (a2)
@@ -3154,9 +3154,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
+; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    not a3, a3
-; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
@@ -3178,9 +3178,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.rl a1, a1, (a2)
@@ -3192,9 +3192,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3206,9 +3206,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.rl a1, a1, (a2)
@@ -3220,9 +3220,9 @@ define i8 @atomicrmw_and_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3258,9 +3258,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
+; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    not a3, a3
-; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3272,9 +3272,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
+; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    not a3, a3
-; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
@@ -3296,9 +3296,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3310,9 +3310,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3324,9 +3324,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3338,9 +3338,9 @@ define i8 @atomicrmw_and_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3376,9 +3376,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
+; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    not a3, a3
-; RV32IA-WMO-NEXT:    andi a1, a1, 255
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3390,9 +3390,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
+; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    not a3, a3
-; RV32IA-TSO-NEXT:    andi a1, a1, 255
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
@@ -3414,9 +3414,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3428,9 +3428,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3442,9 +3442,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
+; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    not a3, a3
-; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
@@ -3456,9 +3456,9 @@ define i8 @atomicrmw_and_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
+; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    not a3, a3
-; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
@@ -3494,8 +3494,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -3525,8 +3525,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3546,8 +3546,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -3567,8 +3567,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3588,8 +3588,8 @@ define i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB30_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3653,8 +3653,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -3674,8 +3674,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -3705,8 +3705,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -3726,8 +3726,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3747,8 +3747,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -3768,8 +3768,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -3789,8 +3789,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -3810,8 +3810,8 @@ define i8 @atomicrmw_nand_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB31_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3875,8 +3875,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -3896,8 +3896,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -3927,8 +3927,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3948,8 +3948,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -3969,8 +3969,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -3990,8 +3990,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -4011,8 +4011,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -4032,8 +4032,8 @@ define i8 @atomicrmw_nand_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB32_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -4097,8 +4097,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -4118,8 +4118,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -4149,8 +4149,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -4170,8 +4170,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -4191,8 +4191,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -4212,8 +4212,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -4233,8 +4233,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -4254,8 +4254,8 @@ define i8 @atomicrmw_nand_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB33_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -4319,8 +4319,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -4350,8 +4350,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -4371,8 +4371,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -4392,8 +4392,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -4413,8 +4413,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:  .LBB34_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -4436,8 +4436,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:  .LBB34_1: # %atomicrmw.start
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    and a3, a0, a1
-; RV64IA-WMO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    fence rw, rw
+; RV64IA-WMO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    slli a4, a0, 56
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    amocas.b.aqrl a0, a3, (a2)
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    srai a4, a4, 56
@@ -4452,8 +4452,8 @@ define i8 @atomicrmw_nand_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:  .LBB34_1: # %atomicrmw.start
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    and a3, a0, a1
-; RV64IA-TSO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    fence rw, rw
+; RV64IA-TSO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    slli a4, a0, 56
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    amocas.b a0, a3, (a2)
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    srai a4, a4, 56
@@ -5391,23 +5391,23 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB45_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB45_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB45_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB45_3: # in Loop: Header=BB45_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
@@ -5462,23 +5462,23 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    li a4, 255
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-NOZACAS-NEXT:  .LBB45_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a7, a1, .LBB45_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB45_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB45_3: # in Loop: Header=BB45_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5491,23 +5491,23 @@ define i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    li a3, 255
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-ZACAS-NEXT:  .LBB45_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-ZACAS-NEXT:    mv a6, a5
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a7, a1, .LBB45_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB45_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB45_3: # in Loop: Header=BB45_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5576,23 +5576,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB46_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w a6, a6, (a2)
@@ -5605,23 +5605,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB46_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -5676,23 +5676,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB46_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5705,23 +5705,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB46_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5734,23 +5734,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB46_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5763,23 +5763,23 @@ define i8 @atomicrmw_max_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB46_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB46_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB46_3: # in Loop: Header=BB46_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -5848,23 +5848,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB47_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
@@ -5877,23 +5877,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB47_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -5948,23 +5948,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB47_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -5977,23 +5977,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB47_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6006,23 +6006,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB47_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6035,23 +6035,23 @@ define i8 @atomicrmw_max_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB47_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB47_3: # in Loop: Header=BB47_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6120,23 +6120,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB48_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6149,23 +6149,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB48_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -6220,23 +6220,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB48_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6249,23 +6249,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB48_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6278,23 +6278,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB48_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6307,23 +6307,23 @@ define i8 @atomicrmw_max_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB48_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB48_3: # in Loop: Header=BB48_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6392,23 +6392,23 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB49_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB49_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB49_3: # in Loop: Header=BB49_1 Depth=1
 ; RV32IA-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6463,23 +6463,23 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    li a4, 255
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-NOZACAS-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a5, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a7, a1, .LBB49_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB49_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB49_3: # in Loop: Header=BB49_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6492,23 +6492,23 @@ define i8 @atomicrmw_max_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    li a3, 255
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-ZACAS-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a5, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-ZACAS-NEXT:    mv a6, a5
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a7, a1, .LBB49_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB49_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB49_3: # in Loop: Header=BB49_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -6577,23 +6577,23 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB50_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB50_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB50_3: # in Loop: Header=BB50_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
@@ -6648,23 +6648,23 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    li a4, 255
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-NOZACAS-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a1, a7, .LBB50_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB50_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB50_3: # in Loop: Header=BB50_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6677,23 +6677,23 @@ define i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    li a3, 255
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-ZACAS-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-ZACAS-NEXT:    mv a6, a5
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a1, a7, .LBB50_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB50_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB50_3: # in Loop: Header=BB50_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6762,23 +6762,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB51_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w a6, a6, (a2)
@@ -6791,23 +6791,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB51_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -6862,23 +6862,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB51_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6891,23 +6891,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB51_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6920,23 +6920,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB51_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -6949,23 +6949,23 @@ define i8 @atomicrmw_min_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB51_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB51_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB51_3: # in Loop: Header=BB51_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7034,23 +7034,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB52_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7063,23 +7063,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB52_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -7134,23 +7134,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB52_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7163,23 +7163,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB52_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7192,23 +7192,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB52_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7221,23 +7221,23 @@ define i8 @atomicrmw_min_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB52_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB52_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB52_3: # in Loop: Header=BB52_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7306,23 +7306,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    li a4, 255
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    li a3, 255
 ; RV32IA-WMO-NEXT:    slli a1, a1, 24
+; RV32IA-WMO-NEXT:    andi a4, a0, 24
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    srai a1, a1, 24
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    xori a3, a3, 24
+; RV32IA-WMO-NEXT:    xori a4, a4, 24
 ; RV32IA-WMO-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
-; RV32IA-WMO-NEXT:    and a7, a5, a4
+; RV32IA-WMO-NEXT:    and a7, a5, a3
 ; RV32IA-WMO-NEXT:    mv a6, a5
-; RV32IA-WMO-NEXT:    sll a7, a7, a3
-; RV32IA-WMO-NEXT:    sra a7, a7, a3
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB53_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV32IA-WMO-NEXT:    xor a6, a5, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
+; RV32IA-WMO-NEXT:    and a6, a6, a3
 ; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7335,23 +7335,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    li a4, 255
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    li a3, 255
 ; RV32IA-TSO-NEXT:    slli a1, a1, 24
+; RV32IA-TSO-NEXT:    andi a4, a0, 24
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    srai a1, a1, 24
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    xori a3, a3, 24
+; RV32IA-TSO-NEXT:    xori a4, a4, 24
 ; RV32IA-TSO-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a5, (a2)
-; RV32IA-TSO-NEXT:    and a7, a5, a4
+; RV32IA-TSO-NEXT:    and a7, a5, a3
 ; RV32IA-TSO-NEXT:    mv a6, a5
-; RV32IA-TSO-NEXT:    sll a7, a7, a3
-; RV32IA-TSO-NEXT:    sra a7, a7, a3
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB53_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV32IA-TSO-NEXT:    xor a6, a5, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
+; RV32IA-TSO-NEXT:    and a6, a6, a3
 ; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
@@ -7406,23 +7406,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB53_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7435,23 +7435,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB53_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7464,23 +7464,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    li a4, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-WMO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-WMO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB53_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7493,23 +7493,23 @@ define i8 @atomicrmw_min_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    li a4, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-TSO-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-TSO-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB53_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB53_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB53_3: # in Loop: Header=BB53_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
@@ -7578,23 +7578,23 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB54_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB54_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB54_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB54_3: # in Loop: Header=BB54_1 Depth=1
 ; RV32IA-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7649,23 +7649,23 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    li a4, 255
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    li a3, 255
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-NOZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-NOZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-NOZACAS-NEXT:  .LBB54_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a5, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a5, a4
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-NOZACAS-NEXT:    mv a6, a5
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a1, a7, .LBB54_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB54_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB54_3: # in Loop: Header=BB54_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7678,23 +7678,23 @@ define i8 @atomicrmw_min_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    li a4, 255
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    li a3, 255
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 56
+; RV64IA-ZACAS-NEXT:    andi a4, a0, 24
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 56
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    xori a3, a3, 56
+; RV64IA-ZACAS-NEXT:    xori a4, a4, 56
 ; RV64IA-ZACAS-NEXT:  .LBB54_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a5, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a5, a4
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
 ; RV64IA-ZACAS-NEXT:    mv a6, a5
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a3
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a3
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a1, a7, .LBB54_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB54_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
 ; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB54_3: # in Loop: Header=BB54_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
@@ -7762,8 +7762,8 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB55_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -7826,8 +7826,8 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB55_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -7850,8 +7850,8 @@ define i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB55_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -7928,8 +7928,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -7952,8 +7952,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -8016,8 +8016,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -8040,8 +8040,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8064,8 +8064,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -8088,8 +8088,8 @@ define i8 @atomicrmw_umax_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB56_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8166,8 +8166,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -8190,8 +8190,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -8254,8 +8254,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8278,8 +8278,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8302,8 +8302,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8326,8 +8326,8 @@ define i8 @atomicrmw_umax_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB57_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8404,8 +8404,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -8428,8 +8428,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -8492,8 +8492,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -8516,8 +8516,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8540,8 +8540,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -8564,8 +8564,8 @@ define i8 @atomicrmw_umax_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB58_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8642,8 +8642,8 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB59_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -8706,8 +8706,8 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB59_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -8730,8 +8730,8 @@ define i8 @atomicrmw_umax_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB59_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -8808,8 +8808,8 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB60_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -8872,8 +8872,8 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB60_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -8896,8 +8896,8 @@ define i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB60_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w a4, (a2)
@@ -8974,8 +8974,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -8998,8 +8998,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -9062,8 +9062,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -9086,8 +9086,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -9110,8 +9110,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -9134,8 +9134,8 @@ define i8 @atomicrmw_umin_i8_acquire(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB61_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -9212,8 +9212,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w a4, (a2)
@@ -9236,8 +9236,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -9300,8 +9300,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -9324,8 +9324,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -9348,8 +9348,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -9372,8 +9372,8 @@ define i8 @atomicrmw_umin_i8_release(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB62_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -9450,8 +9450,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
 ; RV32IA-WMO-NEXT:    li a3, 255
-; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    andi a1, a1, 255
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
 ; RV32IA-WMO-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-WMO-NEXT:    lr.w.aq a4, (a2)
@@ -9474,8 +9474,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
 ; RV32IA-TSO-NEXT:    li a3, 255
-; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    andi a1, a1, 255
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
 ; RV32IA-TSO-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-TSO-NEXT:    lr.w a4, (a2)
@@ -9538,8 +9538,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -9562,8 +9562,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-NOZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    lr.w a4, (a2)
@@ -9586,8 +9586,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-WMO-ZACAS-NEXT:    li a3, 255
-; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a4, (a2)
@@ -9610,8 +9610,8 @@ define i8 @atomicrmw_umin_i8_acq_rel(ptr %a, i8 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-TSO-ZACAS-NEXT:    li a3, 255
-; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB63_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    lr.w a4, (a2)
@@ -9688,8 +9688,8 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB64_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w.aqrl a4, (a2)
@@ -9752,8 +9752,8 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-NOZACAS-NEXT:    li a3, 255
-; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-NOZACAS-NEXT:  .LBB64_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -9776,8 +9776,8 @@ define i8 @atomicrmw_umin_i8_seq_cst(ptr %a, i8 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
 ; RV64IA-ZACAS-NEXT:    li a3, 255
-; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    andi a1, a1, 255
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
 ; RV64IA-ZACAS-NEXT:  .LBB64_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-ZACAS-NEXT:    lr.w.aqrl a4, (a2)
@@ -12862,10 +12862,10 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    sll a4, a3, a0
-; RV32IA-NEXT:    not a4, a4
 ; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    not a3, a4
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    or a1, a1, a4
+; RV32IA-NEXT:    or a1, a1, a3
 ; RV32IA-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-NEXT:    srl a0, a1, a0
 ; RV32IA-NEXT:    ret
@@ -12887,10 +12887,10 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-NOZACAS-NEXT:    ret
@@ -12902,10 +12902,10 @@ define i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-ZACAS-NEXT:    not a4, a4
 ; RV64IA-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-ZACAS-NEXT:    not a3, a4
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-ZACAS-NEXT:    ret
@@ -12941,10 +12941,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    sll a4, a3, a0
-; RV32IA-WMO-NEXT:    not a4, a4
 ; RV32IA-WMO-NEXT:    and a1, a1, a3
+; RV32IA-WMO-NEXT:    not a3, a4
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    or a1, a1, a4
+; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aq a1, a1, (a2)
 ; RV32IA-WMO-NEXT:    srl a0, a1, a0
 ; RV32IA-WMO-NEXT:    ret
@@ -12956,10 +12956,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    sll a4, a3, a0
-; RV32IA-TSO-NEXT:    not a4, a4
 ; RV32IA-TSO-NEXT:    and a1, a1, a3
+; RV32IA-TSO-NEXT:    not a3, a4
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    or a1, a1, a4
+; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-TSO-NEXT:    srl a0, a1, a0
 ; RV32IA-TSO-NEXT:    ret
@@ -12981,10 +12981,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aq a1, a1, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
@@ -12996,10 +12996,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
@@ -13011,10 +13011,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aq a1, a1, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
@@ -13026,10 +13026,10 @@ define i16 @atomicrmw_and_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
@@ -13065,10 +13065,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    sll a4, a3, a0
-; RV32IA-WMO-NEXT:    not a4, a4
 ; RV32IA-WMO-NEXT:    and a1, a1, a3
+; RV32IA-WMO-NEXT:    not a3, a4
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    or a1, a1, a4
+; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.rl a1, a1, (a2)
 ; RV32IA-WMO-NEXT:    srl a0, a1, a0
 ; RV32IA-WMO-NEXT:    ret
@@ -13080,10 +13080,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    sll a4, a3, a0
-; RV32IA-TSO-NEXT:    not a4, a4
 ; RV32IA-TSO-NEXT:    and a1, a1, a3
+; RV32IA-TSO-NEXT:    not a3, a4
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    or a1, a1, a4
+; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-TSO-NEXT:    srl a0, a1, a0
 ; RV32IA-TSO-NEXT:    ret
@@ -13105,10 +13105,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.rl a1, a1, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
@@ -13120,10 +13120,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
@@ -13135,10 +13135,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.rl a1, a1, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
@@ -13150,10 +13150,10 @@ define i16 @atomicrmw_and_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
@@ -13189,10 +13189,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    sll a4, a3, a0
-; RV32IA-WMO-NEXT:    not a4, a4
 ; RV32IA-WMO-NEXT:    and a1, a1, a3
+; RV32IA-WMO-NEXT:    not a3, a4
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    or a1, a1, a4
+; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV32IA-WMO-NEXT:    srl a0, a1, a0
 ; RV32IA-WMO-NEXT:    ret
@@ -13204,10 +13204,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    sll a4, a3, a0
-; RV32IA-TSO-NEXT:    not a4, a4
 ; RV32IA-TSO-NEXT:    and a1, a1, a3
+; RV32IA-TSO-NEXT:    not a3, a4
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    or a1, a1, a4
+; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-TSO-NEXT:    srl a0, a1, a0
 ; RV32IA-TSO-NEXT:    ret
@@ -13229,10 +13229,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
@@ -13244,10 +13244,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
@@ -13259,10 +13259,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
@@ -13274,10 +13274,10 @@ define i16 @atomicrmw_and_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
@@ -13313,10 +13313,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    sll a4, a3, a0
-; RV32IA-WMO-NEXT:    not a4, a4
 ; RV32IA-WMO-NEXT:    and a1, a1, a3
+; RV32IA-WMO-NEXT:    not a3, a4
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    or a1, a1, a4
+; RV32IA-WMO-NEXT:    or a1, a1, a3
 ; RV32IA-WMO-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV32IA-WMO-NEXT:    srl a0, a1, a0
 ; RV32IA-WMO-NEXT:    ret
@@ -13328,10 +13328,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    sll a4, a3, a0
-; RV32IA-TSO-NEXT:    not a4, a4
 ; RV32IA-TSO-NEXT:    and a1, a1, a3
+; RV32IA-TSO-NEXT:    not a3, a4
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    or a1, a1, a4
+; RV32IA-TSO-NEXT:    or a1, a1, a3
 ; RV32IA-TSO-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-TSO-NEXT:    srl a0, a1, a0
 ; RV32IA-TSO-NEXT:    ret
@@ -13353,10 +13353,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-NOZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
@@ -13368,10 +13368,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-NOZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-NOZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-NOZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-NOZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
@@ -13383,10 +13383,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-WMO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-WMO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-WMO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-WMO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-WMO-ZACAS-NEXT:    amoand.w.aqrl a1, a1, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
@@ -13398,10 +13398,10 @@ define i16 @atomicrmw_and_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a3, a0
-; RV64IA-TSO-ZACAS-NEXT:    not a4, a4
 ; RV64IA-TSO-ZACAS-NEXT:    and a1, a1, a3
+; RV64IA-TSO-ZACAS-NEXT:    not a3, a4
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a4
+; RV64IA-TSO-ZACAS-NEXT:    or a1, a1, a3
 ; RV64IA-TSO-ZACAS-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a1, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
@@ -14411,8 +14411,8 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:  .LBB99_1: # %atomicrmw.start
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    and a3, a0, a1
-; RV64IA-WMO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    fence rw, rw
+; RV64IA-WMO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    slli a4, a0, 48
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    amocas.h.aqrl a0, a3, (a2)
 ; RV64IA-WMO-ZABHA-ZACAS-NEXT:    srai a4, a4, 48
@@ -14427,8 +14427,8 @@ define i16 @atomicrmw_nand_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:  .LBB99_1: # %atomicrmw.start
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    and a3, a0, a1
-; RV64IA-TSO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    fence rw, rw
+; RV64IA-TSO-ZABHA-ZACAS-NEXT:    not a3, a3
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    slli a4, a0, 48
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    amocas.h a0, a3, (a2)
 ; RV64IA-TSO-ZABHA-ZACAS-NEXT:    srai a4, a4, 48
@@ -15420,31 +15420,31 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB110_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB110_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB110_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB110_3: # in Loop: Header=BB110_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB110_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_monotonic:
@@ -15493,62 +15493,62 @@ define i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    li a4, 48
+; RV64IA-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    li a5, 48
-; RV64IA-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-NOZACAS-NEXT:  .LBB110_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a7, a1, .LBB110_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB110_1 Depth=1
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB110_3: # in Loop: Header=BB110_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-NOZACAS-NEXT:    bnez a6, .LBB110_1
 ; RV64IA-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-ZACAS-LABEL: atomicrmw_max_i16_monotonic:
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    lui a4, 16
-; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-ZACAS-NEXT:    li a4, 48
+; RV64IA-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    li a5, 48
-; RV64IA-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-ZACAS-NEXT:  .LBB110_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-ZACAS-NEXT:    mv a6, a3
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-ZACAS-NEXT:    mv a6, a5
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a7, a1, .LBB110_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB110_1 Depth=1
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB110_3: # in Loop: Header=BB110_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-ZACAS-NEXT:    bnez a6, .LBB110_1
 ; RV64IA-ZACAS-NEXT:  # %bb.4:
-; RV64IA-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_monotonic:
@@ -15611,62 +15611,62 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB111_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB111_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_max_i16_acquire:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB111_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB111_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_acquire:
@@ -15715,124 +15715,124 @@ define i16 @atomicrmw_max_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB111_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB111_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_acquire:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB111_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB111_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_acquire:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB111_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB111_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_acquire:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB111_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB111_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB111_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB111_3: # in Loop: Header=BB111_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB111_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_acquire:
@@ -15895,62 +15895,62 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB112_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB112_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_max_i16_release:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB112_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB112_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_release:
@@ -15999,124 +15999,124 @@ define i16 @atomicrmw_max_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB112_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB112_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_release:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB112_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB112_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_release:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB112_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB112_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_release:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB112_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB112_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB112_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB112_3: # in Loop: Header=BB112_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB112_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_release:
@@ -16179,62 +16179,62 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a7, a1, .LBB113_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB113_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_max_i16_acq_rel:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a7, a1, .LBB113_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB113_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_acq_rel:
@@ -16283,124 +16283,124 @@ define i16 @atomicrmw_max_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a7, a1, .LBB113_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB113_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_max_i16_acq_rel:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a7, a1, .LBB113_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB113_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_max_i16_acq_rel:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a7, a1, .LBB113_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB113_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_max_i16_acq_rel:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB113_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a7, a1, .LBB113_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB113_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB113_3: # in Loop: Header=BB113_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB113_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_acq_rel:
@@ -16463,31 +16463,31 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB114_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w.aqrl a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w.aqrl a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB114_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB114_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB114_3: # in Loop: Header=BB114_1 Depth=1
 ; RV32IA-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB114_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_max_i16_seq_cst:
@@ -16536,62 +16536,62 @@ define i16 @atomicrmw_max_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    li a4, 48
+; RV64IA-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    li a5, 48
-; RV64IA-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-NOZACAS-NEXT:  .LBB114_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a7, a1, .LBB114_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB114_1 Depth=1
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB114_3: # in Loop: Header=BB114_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-NOZACAS-NEXT:    bnez a6, .LBB114_1
 ; RV64IA-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-ZACAS-LABEL: atomicrmw_max_i16_seq_cst:
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    lui a4, 16
-; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-ZACAS-NEXT:    li a4, 48
+; RV64IA-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    li a5, 48
-; RV64IA-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-ZACAS-NEXT:  .LBB114_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-ZACAS-NEXT:    mv a6, a3
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-ZACAS-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-ZACAS-NEXT:    mv a6, a5
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a7, a1, .LBB114_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB114_1 Depth=1
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB114_3: # in Loop: Header=BB114_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-ZACAS-NEXT:    bnez a6, .LBB114_1
 ; RV64IA-ZACAS-NEXT:  # %bb.4:
-; RV64IA-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_max_i16_seq_cst:
@@ -16654,31 +16654,31 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB115_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB115_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB115_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB115_3: # in Loop: Header=BB115_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB115_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_monotonic:
@@ -16727,62 +16727,62 @@ define i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    li a4, 48
+; RV64IA-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    li a5, 48
-; RV64IA-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-NOZACAS-NEXT:  .LBB115_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a1, a7, .LBB115_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB115_1 Depth=1
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB115_3: # in Loop: Header=BB115_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-NOZACAS-NEXT:    bnez a6, .LBB115_1
 ; RV64IA-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-ZACAS-LABEL: atomicrmw_min_i16_monotonic:
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    lui a4, 16
-; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-ZACAS-NEXT:    li a4, 48
+; RV64IA-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    li a5, 48
-; RV64IA-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-ZACAS-NEXT:  .LBB115_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-ZACAS-NEXT:    mv a6, a3
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-ZACAS-NEXT:    mv a6, a5
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a1, a7, .LBB115_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB115_1 Depth=1
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB115_3: # in Loop: Header=BB115_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-ZACAS-NEXT:    bnez a6, .LBB115_1
 ; RV64IA-ZACAS-NEXT:  # %bb.4:
-; RV64IA-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_monotonic:
@@ -16845,62 +16845,62 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB116_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB116_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_min_i16_acquire:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB116_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB116_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_acquire:
@@ -16949,124 +16949,124 @@ define i16 @atomicrmw_min_i16_acquire(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB116_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB116_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_acquire:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB116_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB116_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_acquire:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB116_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB116_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_acquire:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB116_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB116_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB116_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB116_3: # in Loop: Header=BB116_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB116_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_acquire:
@@ -17129,62 +17129,62 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB117_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB117_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_min_i16_release:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB117_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB117_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_release:
@@ -17233,124 +17233,124 @@ define i16 @atomicrmw_min_i16_release(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB117_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB117_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_release:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB117_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB117_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_release:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB117_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB117_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_release:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB117_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB117_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB117_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB117_3: # in Loop: Header=BB117_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB117_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_release:
@@ -17413,62 +17413,62 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV32IA-WMO:       # %bb.0:
 ; RV32IA-WMO-NEXT:    andi a2, a0, -4
 ; RV32IA-WMO-NEXT:    slli a0, a0, 3
-; RV32IA-WMO-NEXT:    andi a3, a0, 24
-; RV32IA-WMO-NEXT:    lui a4, 16
-; RV32IA-WMO-NEXT:    addi a4, a4, -1
-; RV32IA-WMO-NEXT:    sll a4, a4, a0
+; RV32IA-WMO-NEXT:    lui a3, 16
 ; RV32IA-WMO-NEXT:    slli a1, a1, 16
+; RV32IA-WMO-NEXT:    li a4, 16
+; RV32IA-WMO-NEXT:    andi a5, a0, 24
+; RV32IA-WMO-NEXT:    addi a3, a3, -1
 ; RV32IA-WMO-NEXT:    srai a1, a1, 16
+; RV32IA-WMO-NEXT:    sll a3, a3, a0
 ; RV32IA-WMO-NEXT:    sll a1, a1, a0
-; RV32IA-WMO-NEXT:    li a5, 16
-; RV32IA-WMO-NEXT:    sub a5, a5, a3
+; RV32IA-WMO-NEXT:    sub a4, a4, a5
 ; RV32IA-WMO-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-WMO-NEXT:    lr.w.aq a3, (a2)
-; RV32IA-WMO-NEXT:    and a7, a3, a4
-; RV32IA-WMO-NEXT:    mv a6, a3
-; RV32IA-WMO-NEXT:    sll a7, a7, a5
-; RV32IA-WMO-NEXT:    sra a7, a7, a5
+; RV32IA-WMO-NEXT:    lr.w.aq a5, (a2)
+; RV32IA-WMO-NEXT:    and a7, a5, a3
+; RV32IA-WMO-NEXT:    mv a6, a5
+; RV32IA-WMO-NEXT:    sll a7, a7, a4
+; RV32IA-WMO-NEXT:    sra a7, a7, a4
 ; RV32IA-WMO-NEXT:    bge a1, a7, .LBB118_3
 ; RV32IA-WMO-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV32IA-WMO-NEXT:    xor a6, a3, a1
-; RV32IA-WMO-NEXT:    and a6, a6, a4
-; RV32IA-WMO-NEXT:    xor a6, a3, a6
+; RV32IA-WMO-NEXT:    xor a6, a5, a1
+; RV32IA-WMO-NEXT:    and a6, a6, a3
+; RV32IA-WMO-NEXT:    xor a6, a5, a6
 ; RV32IA-WMO-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV32IA-WMO-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-WMO-NEXT:    bnez a6, .LBB118_1
 ; RV32IA-WMO-NEXT:  # %bb.4:
-; RV32IA-WMO-NEXT:    srl a0, a3, a0
+; RV32IA-WMO-NEXT:    srl a0, a5, a0
 ; RV32IA-WMO-NEXT:    ret
 ;
 ; RV32IA-TSO-LABEL: atomicrmw_min_i16_acq_rel:
 ; RV32IA-TSO:       # %bb.0:
 ; RV32IA-TSO-NEXT:    andi a2, a0, -4
 ; RV32IA-TSO-NEXT:    slli a0, a0, 3
-; RV32IA-TSO-NEXT:    andi a3, a0, 24
-; RV32IA-TSO-NEXT:    lui a4, 16
-; RV32IA-TSO-NEXT:    addi a4, a4, -1
-; RV32IA-TSO-NEXT:    sll a4, a4, a0
+; RV32IA-TSO-NEXT:    lui a3, 16
 ; RV32IA-TSO-NEXT:    slli a1, a1, 16
+; RV32IA-TSO-NEXT:    li a4, 16
+; RV32IA-TSO-NEXT:    andi a5, a0, 24
+; RV32IA-TSO-NEXT:    addi a3, a3, -1
 ; RV32IA-TSO-NEXT:    srai a1, a1, 16
+; RV32IA-TSO-NEXT:    sll a3, a3, a0
 ; RV32IA-TSO-NEXT:    sll a1, a1, a0
-; RV32IA-TSO-NEXT:    li a5, 16
-; RV32IA-TSO-NEXT:    sub a5, a5, a3
+; RV32IA-TSO-NEXT:    sub a4, a4, a5
 ; RV32IA-TSO-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-TSO-NEXT:    lr.w a3, (a2)
-; RV32IA-TSO-NEXT:    and a7, a3, a4
-; RV32IA-TSO-NEXT:    mv a6, a3
-; RV32IA-TSO-NEXT:    sll a7, a7, a5
-; RV32IA-TSO-NEXT:    sra a7, a7, a5
+; RV32IA-TSO-NEXT:    lr.w a5, (a2)
+; RV32IA-TSO-NEXT:    and a7, a5, a3
+; RV32IA-TSO-NEXT:    mv a6, a5
+; RV32IA-TSO-NEXT:    sll a7, a7, a4
+; RV32IA-TSO-NEXT:    sra a7, a7, a4
 ; RV32IA-TSO-NEXT:    bge a1, a7, .LBB118_3
 ; RV32IA-TSO-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV32IA-TSO-NEXT:    xor a6, a3, a1
-; RV32IA-TSO-NEXT:    and a6, a6, a4
-; RV32IA-TSO-NEXT:    xor a6, a3, a6
+; RV32IA-TSO-NEXT:    xor a6, a5, a1
+; RV32IA-TSO-NEXT:    and a6, a6, a3
+; RV32IA-TSO-NEXT:    xor a6, a5, a6
 ; RV32IA-TSO-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV32IA-TSO-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-TSO-NEXT:    bnez a6, .LBB118_1
 ; RV32IA-TSO-NEXT:  # %bb.4:
-; RV32IA-TSO-NEXT:    srl a0, a3, a0
+; RV32IA-TSO-NEXT:    srl a0, a5, a0
 ; RV32IA-TSO-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_acq_rel:
@@ -17517,124 +17517,124 @@ define i16 @atomicrmw_min_i16_acq_rel(ptr %a, i16 %b) nounwind {
 ; RV64IA-WMO-NOZACAS:       # %bb.0:
 ; RV64IA-WMO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-NOZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-NOZACAS-NEXT:    bge a1, a7, .LBB118_3
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-NOZACAS-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV64IA-WMO-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-NOZACAS-NEXT:    bnez a6, .LBB118_1
 ; RV64IA-WMO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-NOZACAS-LABEL: atomicrmw_min_i16_acq_rel:
 ; RV64IA-TSO-NOZACAS:       # %bb.0:
 ; RV64IA-TSO-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-NOZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-NOZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-NOZACAS-NEXT:    bge a1, a7, .LBB118_3
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-NOZACAS-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV64IA-TSO-NOZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-NOZACAS-NEXT:    bnez a6, .LBB118_1
 ; RV64IA-TSO-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZACAS-LABEL: atomicrmw_min_i16_acq_rel:
 ; RV64IA-WMO-ZACAS:       # %bb.0:
 ; RV64IA-WMO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-WMO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-WMO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-WMO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-WMO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-WMO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-WMO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-WMO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    li a4, 48
+; RV64IA-WMO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-WMO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-WMO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-WMO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-WMO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-WMO-ZACAS-NEXT:    li a5, 48
-; RV64IA-WMO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a3, (a2)
-; RV64IA-WMO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-WMO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-WMO-ZACAS-NEXT:    lr.w.aq a5, (a2)
+; RV64IA-WMO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-WMO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-WMO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-WMO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-WMO-ZACAS-NEXT:    bge a1, a7, .LBB118_3
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-WMO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-WMO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-WMO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-WMO-ZACAS-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV64IA-WMO-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-WMO-ZACAS-NEXT:    bnez a6, .LBB118_1
 ; RV64IA-WMO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-WMO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-WMO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-TSO-ZACAS-LABEL: atomicrmw_min_i16_acq_rel:
 ; RV64IA-TSO-ZACAS:       # %bb.0:
 ; RV64IA-TSO-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-TSO-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-TSO-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-TSO-ZACAS-NEXT:    lui a4, 16
-; RV64IA-TSO-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-TSO-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-TSO-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-TSO-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    li a4, 48
+; RV64IA-TSO-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-TSO-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-TSO-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-TSO-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-TSO-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-TSO-ZACAS-NEXT:    li a5, 48
-; RV64IA-TSO-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB118_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    lr.w a3, (a2)
-; RV64IA-TSO-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-TSO-ZACAS-NEXT:    mv a6, a3
-; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-TSO-ZACAS-NEXT:    lr.w a5, (a2)
+; RV64IA-TSO-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-TSO-ZACAS-NEXT:    mv a6, a5
+; RV64IA-TSO-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-TSO-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-TSO-ZACAS-NEXT:    bge a1, a7, .LBB118_3
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB118_1 Depth=1
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-TSO-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-TSO-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-TSO-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-TSO-ZACAS-NEXT:  .LBB118_3: # in Loop: Header=BB118_1 Depth=1
 ; RV64IA-TSO-ZACAS-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-TSO-ZACAS-NEXT:    bnez a6, .LBB118_1
 ; RV64IA-TSO-ZACAS-NEXT:  # %bb.4:
-; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-TSO-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-TSO-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_acq_rel:
@@ -17697,31 +17697,31 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB119_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w.aqrl a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w.aqrl a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB119_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB119_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB119_3: # in Loop: Header=BB119_1 Depth=1
 ; RV32IA-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB119_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    ret
 ;
 ; RV64I-LABEL: atomicrmw_min_i16_seq_cst:
@@ -17770,62 +17770,62 @@ define i16 @atomicrmw_min_i16_seq_cst(ptr %a, i16 %b) nounwind {
 ; RV64IA-NOZACAS:       # %bb.0:
 ; RV64IA-NOZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-NOZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-NOZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-NOZACAS-NEXT:    lui a4, 16
-; RV64IA-NOZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-NOZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-NOZACAS-NEXT:    lui a3, 16
 ; RV64IA-NOZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    li a4, 48
+; RV64IA-NOZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-NOZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-NOZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-NOZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-NOZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-NOZACAS-NEXT:    li a5, 48
-; RV64IA-NOZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-NOZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-NOZACAS-NEXT:  .LBB119_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-NOZACAS-NEXT:    and a7, a3, a4
-; RV64IA-NOZACAS-NEXT:    mv a6, a3
-; RV64IA-NOZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-NOZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-NOZACAS-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-NOZACAS-NEXT:    and a7, a5, a3
+; RV64IA-NOZACAS-NEXT:    mv a6, a5
+; RV64IA-NOZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-NOZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-NOZACAS-NEXT:    bge a1, a7, .LBB119_3
 ; RV64IA-NOZACAS-NEXT:  # %bb.2: # in Loop: Header=BB119_1 Depth=1
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-NOZACAS-NEXT:    and a6, a6, a4
-; RV64IA-NOZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-NOZACAS-NEXT:    and a6, a6, a3
+; RV64IA-NOZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-NOZACAS-NEXT:  .LBB119_3: # in Loop: Header=BB119_1 Depth=1
 ; RV64IA-NOZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-NOZACAS-NEXT:    bnez a6, .LBB119_1
 ; RV64IA-NOZACAS-NEXT:  # %bb.4:
-; RV64IA-NOZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-NOZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-NOZACAS-NEXT:    ret
 ;
 ; RV64IA-ZACAS-LABEL: atomicrmw_min_i16_seq_cst:
 ; RV64IA-ZACAS:       # %bb.0:
 ; RV64IA-ZACAS-NEXT:    andi a2, a0, -4
 ; RV64IA-ZACAS-NEXT:    slli a0, a0, 3
-; RV64IA-ZACAS-NEXT:    andi a3, a0, 24
-; RV64IA-ZACAS-NEXT:    lui a4, 16
-; RV64IA-ZACAS-NEXT:    addi a4, a4, -1
-; RV64IA-ZACAS-NEXT:    sllw a4, a4, a0
+; RV64IA-ZACAS-NEXT:    lui a3, 16
 ; RV64IA-ZACAS-NEXT:    slli a1, a1, 48
+; RV64IA-ZACAS-NEXT:    li a4, 48
+; RV64IA-ZACAS-NEXT:    andi a5, a0, 24
+; RV64IA-ZACAS-NEXT:    addi a3, a3, -1
 ; RV64IA-ZACAS-NEXT:    srai a1, a1, 48
+; RV64IA-ZACAS-NEXT:    sllw a3, a3, a0
 ; RV64IA-ZACAS-NEXT:    sllw a1, a1, a0
-; RV64IA-ZACAS-NEXT:    li a5, 48
-; RV64IA-ZACAS-NEXT:    sub a5, a5, a3
+; RV64IA-ZACAS-NEXT:    sub a4, a4, a5
 ; RV64IA-ZACAS-NEXT:  .LBB119_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-ZACAS-NEXT:    lr.w.aqrl a3, (a2)
-; RV64IA-ZACAS-NEXT:    and a7, a3, a4
-; RV64IA-ZACAS-NEXT:    mv a6, a3
-; RV64IA-ZACAS-NEXT:    sll a7, a7, a5
-; RV64IA-ZACAS-NEXT:    sra a7, a7, a5
+; RV64IA-ZACAS-NEXT:    lr.w.aqrl a5, (a2)
+; RV64IA-ZACAS-NEXT:    and a7, a5, a3
+; RV64IA-ZACAS-NEXT:    mv a6, a5
+; RV64IA-ZACAS-NEXT:    sll a7, a7, a4
+; RV64IA-ZACAS-NEXT:    sra a7, a7, a4
 ; RV64IA-ZACAS-NEXT:    bge a1, a7, .LBB119_3
 ; RV64IA-ZACAS-NEXT:  # %bb.2: # in Loop: Header=BB119_1 Depth=1
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a1
-; RV64IA-ZACAS-NEXT:    and a6, a6, a4
-; RV64IA-ZACAS-NEXT:    xor a6, a3, a6
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a1
+; RV64IA-ZACAS-NEXT:    and a6, a6, a3
+; RV64IA-ZACAS-NEXT:    xor a6, a5, a6
 ; RV64IA-ZACAS-NEXT:  .LBB119_3: # in Loop: Header=BB119_1 Depth=1
 ; RV64IA-ZACAS-NEXT:    sc.w.rl a6, a6, (a2)
 ; RV64IA-ZACAS-NEXT:    bnez a6, .LBB119_1
 ; RV64IA-ZACAS-NEXT:  # %bb.4:
-; RV64IA-ZACAS-NEXT:    srlw a0, a3, a0
+; RV64IA-ZACAS-NEXT:    srlw a0, a5, a0
 ; RV64IA-ZACAS-NEXT:    ret
 ;
 ; RV64IA-WMO-ZABHA-LABEL: atomicrmw_min_i16_seq_cst:
@@ -26072,46 +26072,46 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB220_2
 ; RV32I-NEXT:  .LBB220_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB220_7
 ; RV32I-NEXT:  .LBB220_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB220_4
+; RV32I-NEXT:    beq a5, s0, .LBB220_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB220_5
 ; RV32I-NEXT:  .LBB220_4: # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB220_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB220_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB220_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB220_1
 ; RV32I-NEXT:  .LBB220_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26126,46 +26126,46 @@ define i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB220_2
 ; RV32IA-NEXT:  .LBB220_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB220_7
 ; RV32IA-NEXT:  .LBB220_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB220_4
+; RV32IA-NEXT:    beq a5, s0, .LBB220_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB220_5
 ; RV32IA-NEXT:  .LBB220_4: # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB220_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB220_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB220_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB220_1
 ; RV32IA-NEXT:  .LBB220_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26225,46 +26225,46 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB221_2
 ; RV32I-NEXT:  .LBB221_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB221_7
 ; RV32I-NEXT:  .LBB221_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB221_4
+; RV32I-NEXT:    beq a5, s0, .LBB221_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB221_5
 ; RV32I-NEXT:  .LBB221_4: # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB221_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB221_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB221_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB221_1
 ; RV32I-NEXT:  .LBB221_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26279,46 +26279,46 @@ define i64 @atomicrmw_max_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB221_2
 ; RV32IA-NEXT:  .LBB221_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB221_7
 ; RV32IA-NEXT:  .LBB221_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB221_4
+; RV32IA-NEXT:    beq a5, s0, .LBB221_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB221_5
 ; RV32IA-NEXT:  .LBB221_4: # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB221_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB221_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB221_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB221_1
 ; RV32IA-NEXT:  .LBB221_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26383,46 +26383,46 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB222_2
 ; RV32I-NEXT:  .LBB222_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB222_7
 ; RV32I-NEXT:  .LBB222_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB222_4
+; RV32I-NEXT:    beq a5, s0, .LBB222_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB222_5
 ; RV32I-NEXT:  .LBB222_4: # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB222_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB222_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB222_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB222_1
 ; RV32I-NEXT:  .LBB222_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26437,46 +26437,46 @@ define i64 @atomicrmw_max_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB222_2
 ; RV32IA-NEXT:  .LBB222_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB222_7
 ; RV32IA-NEXT:  .LBB222_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB222_4
+; RV32IA-NEXT:    beq a5, s0, .LBB222_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB222_5
 ; RV32IA-NEXT:  .LBB222_4: # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB222_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB222_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB222_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB222_1
 ; RV32IA-NEXT:  .LBB222_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26541,46 +26541,46 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB223_2
 ; RV32I-NEXT:  .LBB223_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB223_7
 ; RV32I-NEXT:  .LBB223_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB223_4
+; RV32I-NEXT:    beq a5, s0, .LBB223_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB223_5
 ; RV32I-NEXT:  .LBB223_4: # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB223_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB223_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB223_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB223_1
 ; RV32I-NEXT:  .LBB223_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26595,46 +26595,46 @@ define i64 @atomicrmw_max_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB223_2
 ; RV32IA-NEXT:  .LBB223_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB223_7
 ; RV32IA-NEXT:  .LBB223_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB223_4
+; RV32IA-NEXT:    beq a5, s0, .LBB223_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB223_5
 ; RV32IA-NEXT:  .LBB223_4: # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB223_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB223_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB223_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB223_1
 ; RV32IA-NEXT:  .LBB223_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26699,46 +26699,46 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB224_2
 ; RV32I-NEXT:  .LBB224_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB224_7
 ; RV32I-NEXT:  .LBB224_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB224_4
+; RV32I-NEXT:    beq a5, s0, .LBB224_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB224_5
 ; RV32I-NEXT:  .LBB224_4: # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB224_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB224_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB224_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB224_1
 ; RV32I-NEXT:  .LBB224_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26753,46 +26753,46 @@ define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB224_2
 ; RV32IA-NEXT:  .LBB224_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB224_7
 ; RV32IA-NEXT:  .LBB224_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB224_4
+; RV32IA-NEXT:    beq a5, s0, .LBB224_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB224_5
 ; RV32IA-NEXT:  .LBB224_4: # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB224_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB224_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB224_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB224_1
 ; RV32IA-NEXT:  .LBB224_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26857,46 +26857,46 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB225_2
 ; RV32I-NEXT:  .LBB225_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB225_7
 ; RV32I-NEXT:  .LBB225_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB225_4
+; RV32I-NEXT:    beq a5, s0, .LBB225_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB225_5
 ; RV32I-NEXT:  .LBB225_4: # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB225_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB225_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB225_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB225_1
 ; RV32I-NEXT:  .LBB225_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -26911,46 +26911,46 @@ define i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB225_2
 ; RV32IA-NEXT:  .LBB225_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB225_7
 ; RV32IA-NEXT:  .LBB225_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB225_4
+; RV32IA-NEXT:    beq a5, s0, .LBB225_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB225_5
 ; RV32IA-NEXT:  .LBB225_4: # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB225_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB225_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB225_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB225_1
 ; RV32IA-NEXT:  .LBB225_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27010,46 +27010,46 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB226_2
 ; RV32I-NEXT:  .LBB226_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB226_7
 ; RV32I-NEXT:  .LBB226_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB226_4
+; RV32I-NEXT:    beq a5, s0, .LBB226_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB226_5
 ; RV32I-NEXT:  .LBB226_4: # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB226_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB226_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB226_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB226_1
 ; RV32I-NEXT:  .LBB226_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27064,46 +27064,46 @@ define i64 @atomicrmw_min_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB226_2
 ; RV32IA-NEXT:  .LBB226_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB226_7
 ; RV32IA-NEXT:  .LBB226_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB226_4
+; RV32IA-NEXT:    beq a5, s0, .LBB226_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB226_5
 ; RV32IA-NEXT:  .LBB226_4: # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB226_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB226_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB226_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB226_1
 ; RV32IA-NEXT:  .LBB226_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27168,46 +27168,46 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB227_2
 ; RV32I-NEXT:  .LBB227_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB227_7
 ; RV32I-NEXT:  .LBB227_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB227_4
+; RV32I-NEXT:    beq a5, s0, .LBB227_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB227_5
 ; RV32I-NEXT:  .LBB227_4: # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB227_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB227_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB227_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB227_1
 ; RV32I-NEXT:  .LBB227_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27222,46 +27222,46 @@ define i64 @atomicrmw_min_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB227_2
 ; RV32IA-NEXT:  .LBB227_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB227_7
 ; RV32IA-NEXT:  .LBB227_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB227_4
+; RV32IA-NEXT:    beq a5, s0, .LBB227_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB227_5
 ; RV32IA-NEXT:  .LBB227_4: # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB227_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB227_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB227_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB227_1
 ; RV32IA-NEXT:  .LBB227_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27326,46 +27326,46 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB228_2
 ; RV32I-NEXT:  .LBB228_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB228_7
 ; RV32I-NEXT:  .LBB228_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB228_4
+; RV32I-NEXT:    beq a5, s0, .LBB228_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB228_5
 ; RV32I-NEXT:  .LBB228_4: # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB228_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB228_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB228_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB228_1
 ; RV32I-NEXT:  .LBB228_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27380,46 +27380,46 @@ define i64 @atomicrmw_min_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB228_2
 ; RV32IA-NEXT:  .LBB228_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB228_7
 ; RV32IA-NEXT:  .LBB228_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB228_4
+; RV32IA-NEXT:    beq a5, s0, .LBB228_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB228_5
 ; RV32IA-NEXT:  .LBB228_4: # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB228_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB228_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB228_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB228_1
 ; RV32IA-NEXT:  .LBB228_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27484,46 +27484,46 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB229_2
 ; RV32I-NEXT:  .LBB229_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB229_7
 ; RV32I-NEXT:  .LBB229_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB229_4
+; RV32I-NEXT:    beq a5, s0, .LBB229_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB229_5
 ; RV32I-NEXT:  .LBB229_4: # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB229_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB229_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB229_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB229_1
 ; RV32I-NEXT:  .LBB229_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27538,46 +27538,46 @@ define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB229_2
 ; RV32IA-NEXT:  .LBB229_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB229_7
 ; RV32IA-NEXT:  .LBB229_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB229_4
+; RV32IA-NEXT:    beq a5, s0, .LBB229_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB229_5
 ; RV32IA-NEXT:  .LBB229_4: # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB229_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB229_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB229_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB229_1
 ; RV32IA-NEXT:  .LBB229_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27642,46 +27642,46 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB230_2
 ; RV32I-NEXT:  .LBB230_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB230_7
 ; RV32I-NEXT:  .LBB230_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB230_4
+; RV32I-NEXT:    beq a5, s0, .LBB230_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB230_5
 ; RV32I-NEXT:  .LBB230_4: # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB230_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB230_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB230_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB230_1
 ; RV32I-NEXT:  .LBB230_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27696,46 +27696,46 @@ define i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB230_2
 ; RV32IA-NEXT:  .LBB230_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB230_7
 ; RV32IA-NEXT:  .LBB230_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB230_4
+; RV32IA-NEXT:    beq a5, s0, .LBB230_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB230_5
 ; RV32IA-NEXT:  .LBB230_4: # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB230_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB230_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB230_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB230_1
 ; RV32IA-NEXT:  .LBB230_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27795,46 +27795,46 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB231_2
 ; RV32I-NEXT:  .LBB231_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB231_7
 ; RV32I-NEXT:  .LBB231_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB231_4
+; RV32I-NEXT:    beq a5, s0, .LBB231_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB231_5
 ; RV32I-NEXT:  .LBB231_4: # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB231_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB231_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB231_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB231_1
 ; RV32I-NEXT:  .LBB231_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27849,46 +27849,46 @@ define i64 @atomicrmw_umax_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB231_2
 ; RV32IA-NEXT:  .LBB231_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB231_7
 ; RV32IA-NEXT:  .LBB231_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB231_4
+; RV32IA-NEXT:    beq a5, s0, .LBB231_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB231_5
 ; RV32IA-NEXT:  .LBB231_4: # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB231_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB231_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB231_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB231_1
 ; RV32IA-NEXT:  .LBB231_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -27953,46 +27953,46 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB232_2
 ; RV32I-NEXT:  .LBB232_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB232_7
 ; RV32I-NEXT:  .LBB232_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB232_4
+; RV32I-NEXT:    beq a5, s0, .LBB232_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB232_5
 ; RV32I-NEXT:  .LBB232_4: # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB232_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB232_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB232_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB232_1
 ; RV32I-NEXT:  .LBB232_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28007,46 +28007,46 @@ define i64 @atomicrmw_umax_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB232_2
 ; RV32IA-NEXT:  .LBB232_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB232_7
 ; RV32IA-NEXT:  .LBB232_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB232_4
+; RV32IA-NEXT:    beq a5, s0, .LBB232_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB232_5
 ; RV32IA-NEXT:  .LBB232_4: # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB232_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB232_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB232_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB232_1
 ; RV32IA-NEXT:  .LBB232_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28111,46 +28111,46 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB233_2
 ; RV32I-NEXT:  .LBB233_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB233_7
 ; RV32I-NEXT:  .LBB233_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB233_4
+; RV32I-NEXT:    beq a5, s0, .LBB233_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB233_5
 ; RV32I-NEXT:  .LBB233_4: # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB233_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB233_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB233_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB233_1
 ; RV32I-NEXT:  .LBB233_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28165,46 +28165,46 @@ define i64 @atomicrmw_umax_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB233_2
 ; RV32IA-NEXT:  .LBB233_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB233_7
 ; RV32IA-NEXT:  .LBB233_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB233_4
+; RV32IA-NEXT:    beq a5, s0, .LBB233_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB233_5
 ; RV32IA-NEXT:  .LBB233_4: # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB233_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB233_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB233_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB233_1
 ; RV32IA-NEXT:  .LBB233_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28269,46 +28269,46 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB234_2
 ; RV32I-NEXT:  .LBB234_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB234_7
 ; RV32I-NEXT:  .LBB234_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB234_4
+; RV32I-NEXT:    beq a5, s0, .LBB234_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB234_5
 ; RV32I-NEXT:  .LBB234_4: # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB234_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB234_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB234_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB234_1
 ; RV32I-NEXT:  .LBB234_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28323,46 +28323,46 @@ define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB234_2
 ; RV32IA-NEXT:  .LBB234_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB234_7
 ; RV32IA-NEXT:  .LBB234_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB234_4
+; RV32IA-NEXT:    beq a5, s0, .LBB234_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB234_5
 ; RV32IA-NEXT:  .LBB234_4: # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB234_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB234_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB234_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB234_1
 ; RV32IA-NEXT:  .LBB234_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28427,46 +28427,46 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB235_2
 ; RV32I-NEXT:  .LBB235_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB235_7
 ; RV32I-NEXT:  .LBB235_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB235_4
+; RV32I-NEXT:    beq a5, s0, .LBB235_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB235_5
 ; RV32I-NEXT:  .LBB235_4: # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB235_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB235_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB235_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB235_1
 ; RV32I-NEXT:  .LBB235_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28481,46 +28481,46 @@ define i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB235_2
 ; RV32IA-NEXT:  .LBB235_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB235_7
 ; RV32IA-NEXT:  .LBB235_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB235_4
+; RV32IA-NEXT:    beq a5, s0, .LBB235_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB235_5
 ; RV32IA-NEXT:  .LBB235_4: # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB235_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB235_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB235_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB235_1
 ; RV32IA-NEXT:  .LBB235_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28580,46 +28580,46 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB236_2
 ; RV32I-NEXT:  .LBB236_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 2
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB236_7
 ; RV32I-NEXT:  .LBB236_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB236_4
+; RV32I-NEXT:    beq a5, s0, .LBB236_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB236_5
 ; RV32I-NEXT:  .LBB236_4: # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB236_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB236_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB236_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB236_1
 ; RV32I-NEXT:  .LBB236_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28634,46 +28634,46 @@ define i64 @atomicrmw_umin_i64_acquire(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB236_2
 ; RV32IA-NEXT:  .LBB236_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 2
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB236_7
 ; RV32IA-NEXT:  .LBB236_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB236_4
+; RV32IA-NEXT:    beq a5, s0, .LBB236_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB236_5
 ; RV32IA-NEXT:  .LBB236_4: # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB236_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB236_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB236_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB236_1
 ; RV32IA-NEXT:  .LBB236_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28738,46 +28738,46 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB237_2
 ; RV32I-NEXT:  .LBB237_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 3
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB237_7
 ; RV32I-NEXT:  .LBB237_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB237_4
+; RV32I-NEXT:    beq a5, s0, .LBB237_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB237_5
 ; RV32I-NEXT:  .LBB237_4: # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB237_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB237_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB237_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB237_1
 ; RV32I-NEXT:  .LBB237_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28792,46 +28792,46 @@ define i64 @atomicrmw_umin_i64_release(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB237_2
 ; RV32IA-NEXT:  .LBB237_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 3
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB237_7
 ; RV32IA-NEXT:  .LBB237_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB237_4
+; RV32IA-NEXT:    beq a5, s0, .LBB237_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB237_5
 ; RV32IA-NEXT:  .LBB237_4: # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB237_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB237_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB237_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB237_1
 ; RV32IA-NEXT:  .LBB237_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28896,46 +28896,46 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB238_2
 ; RV32I-NEXT:  .LBB238_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 4
 ; RV32I-NEXT:    li a5, 2
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB238_7
 ; RV32I-NEXT:  .LBB238_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB238_4
+; RV32I-NEXT:    beq a5, s0, .LBB238_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB238_5
 ; RV32I-NEXT:  .LBB238_4: # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB238_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB238_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB238_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB238_1
 ; RV32I-NEXT:  .LBB238_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -28950,46 +28950,46 @@ define i64 @atomicrmw_umin_i64_acq_rel(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB238_2
 ; RV32IA-NEXT:  .LBB238_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 4
 ; RV32IA-NEXT:    li a5, 2
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB238_7
 ; RV32IA-NEXT:  .LBB238_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB238_4
+; RV32IA-NEXT:    beq a5, s0, .LBB238_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB238_5
 ; RV32IA-NEXT:  .LBB238_4: # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB238_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB238_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB238_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB238_1
 ; RV32IA-NEXT:  .LBB238_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -29054,46 +29054,46 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB239_2
 ; RV32I-NEXT:  .LBB239_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB239_7
 ; RV32I-NEXT:  .LBB239_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB239_4
+; RV32I-NEXT:    beq a5, s0, .LBB239_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB239_5
 ; RV32I-NEXT:  .LBB239_4: # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB239_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB239_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB239_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB239_1
 ; RV32I-NEXT:  .LBB239_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -29108,46 +29108,46 @@ define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB239_2
 ; RV32IA-NEXT:  .LBB239_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB239_7
 ; RV32IA-NEXT:  .LBB239_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB239_4
+; RV32IA-NEXT:    beq a5, s0, .LBB239_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB239_5
 ; RV32IA-NEXT:  .LBB239_4: # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB239_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB239_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB239_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB239_1
 ; RV32IA-NEXT:  .LBB239_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/atomic-signext.ll b/llvm/test/CodeGen/RISCV/atomic-signext.ll
index 06594e35be8703..aea7473ceece4f 100644
--- a/llvm/test/CodeGen/RISCV/atomic-signext.ll
+++ b/llvm/test/CodeGen/RISCV/atomic-signext.ll
@@ -142,8 +142,8 @@ define signext i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -176,8 +176,8 @@ define signext i8 @atomicrmw_xchg_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -214,8 +214,8 @@ define signext i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -248,8 +248,8 @@ define signext i8 @atomicrmw_add_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB4_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -286,8 +286,8 @@ define signext i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -320,8 +320,8 @@ define signext i8 @atomicrmw_sub_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB5_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -358,9 +358,9 @@ define signext i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
+; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    not a3, a3
-; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    or a1, a1, a3
 ; RV32IA-NEXT:    amoand.w a1, a1, (a2)
@@ -386,9 +386,9 @@ define signext i8 @atomicrmw_and_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
+; RV64IA-NEXT:    andi a1, a1, 255
 ; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    not a3, a3
-; RV64IA-NEXT:    andi a1, a1, 255
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    or a1, a1, a3
 ; RV64IA-NEXT:    amoand.w a1, a1, (a2)
@@ -418,8 +418,8 @@ define signext i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -453,8 +453,8 @@ define signext i8 @atomicrmw_nand_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB7_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -626,23 +626,23 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB10_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB10_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB10_3: # in Loop: Header=BB10_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
@@ -700,23 +700,23 @@ define signext i8 @atomicrmw_max_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    andi a3, a0, 24
-; RV64IA-NEXT:    li a4, 255
-; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    li a3, 255
 ; RV64IA-NEXT:    slli a1, a1, 56
+; RV64IA-NEXT:    andi a4, a0, 24
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    srai a1, a1, 56
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    xori a3, a3, 56
+; RV64IA-NEXT:    xori a4, a4, 56
 ; RV64IA-NEXT:  .LBB10_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a5, (a2)
-; RV64IA-NEXT:    and a7, a5, a4
+; RV64IA-NEXT:    and a7, a5, a3
 ; RV64IA-NEXT:    mv a6, a5
-; RV64IA-NEXT:    sll a7, a7, a3
-; RV64IA-NEXT:    sra a7, a7, a3
+; RV64IA-NEXT:    sll a7, a7, a4
+; RV64IA-NEXT:    sra a7, a7, a4
 ; RV64IA-NEXT:    bge a7, a1, .LBB10_3
 ; RV64IA-NEXT:  # %bb.2: # in Loop: Header=BB10_1 Depth=1
 ; RV64IA-NEXT:    xor a6, a5, a1
-; RV64IA-NEXT:    and a6, a6, a4
+; RV64IA-NEXT:    and a6, a6, a3
 ; RV64IA-NEXT:    xor a6, a5, a6
 ; RV64IA-NEXT:  .LBB10_3: # in Loop: Header=BB10_1 Depth=1
 ; RV64IA-NEXT:    sc.w a6, a6, (a2)
@@ -778,23 +778,23 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    li a3, 255
 ; RV32IA-NEXT:    slli a1, a1, 24
+; RV32IA-NEXT:    andi a4, a0, 24
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    srai a1, a1, 24
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    xori a3, a3, 24
+; RV32IA-NEXT:    xori a4, a4, 24
 ; RV32IA-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a2)
-; RV32IA-NEXT:    and a7, a5, a4
+; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    mv a6, a5
-; RV32IA-NEXT:    sll a7, a7, a3
-; RV32IA-NEXT:    sra a7, a7, a3
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB11_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB11_1 Depth=1
 ; RV32IA-NEXT:    xor a6, a5, a1
-; RV32IA-NEXT:    and a6, a6, a4
+; RV32IA-NEXT:    and a6, a6, a3
 ; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB11_3: # in Loop: Header=BB11_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
@@ -852,23 +852,23 @@ define signext i8 @atomicrmw_min_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    andi a3, a0, 24
-; RV64IA-NEXT:    li a4, 255
-; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    li a3, 255
 ; RV64IA-NEXT:    slli a1, a1, 56
+; RV64IA-NEXT:    andi a4, a0, 24
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    srai a1, a1, 56
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    xori a3, a3, 56
+; RV64IA-NEXT:    xori a4, a4, 56
 ; RV64IA-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a5, (a2)
-; RV64IA-NEXT:    and a7, a5, a4
+; RV64IA-NEXT:    and a7, a5, a3
 ; RV64IA-NEXT:    mv a6, a5
-; RV64IA-NEXT:    sll a7, a7, a3
-; RV64IA-NEXT:    sra a7, a7, a3
+; RV64IA-NEXT:    sll a7, a7, a4
+; RV64IA-NEXT:    sra a7, a7, a4
 ; RV64IA-NEXT:    bge a1, a7, .LBB11_3
 ; RV64IA-NEXT:  # %bb.2: # in Loop: Header=BB11_1 Depth=1
 ; RV64IA-NEXT:    xor a6, a5, a1
-; RV64IA-NEXT:    and a6, a6, a4
+; RV64IA-NEXT:    and a6, a6, a3
 ; RV64IA-NEXT:    xor a6, a5, a6
 ; RV64IA-NEXT:  .LBB11_3: # in Loop: Header=BB11_1 Depth=1
 ; RV64IA-NEXT:    sc.w a6, a6, (a2)
@@ -929,8 +929,8 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -996,8 +996,8 @@ define signext i8 @atomicrmw_umax_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -1067,8 +1067,8 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a3, 255
-; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a2)
@@ -1134,8 +1134,8 @@ define signext i8 @atomicrmw_umin_i8_monotonic(ptr %a, i8 %b) nounwind {
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a3, 255
-; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:  .LBB13_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a2)
@@ -1400,10 +1400,10 @@ define signext i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    sll a4, a3, a0
-; RV32IA-NEXT:    not a4, a4
 ; RV32IA-NEXT:    and a1, a1, a3
+; RV32IA-NEXT:    not a3, a4
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    or a1, a1, a4
+; RV32IA-NEXT:    or a1, a1, a3
 ; RV32IA-NEXT:    amoand.w a1, a1, (a2)
 ; RV32IA-NEXT:    srl a0, a1, a0
 ; RV32IA-NEXT:    slli a0, a0, 16
@@ -1429,10 +1429,10 @@ define signext i16 @atomicrmw_and_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA-NEXT:    lui a3, 16
 ; RV64IA-NEXT:    addi a3, a3, -1
 ; RV64IA-NEXT:    sllw a4, a3, a0
-; RV64IA-NEXT:    not a4, a4
 ; RV64IA-NEXT:    and a1, a1, a3
+; RV64IA-NEXT:    not a3, a4
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    or a1, a1, a4
+; RV64IA-NEXT:    or a1, a1, a3
 ; RV64IA-NEXT:    amoand.w a1, a1, (a2)
 ; RV64IA-NEXT:    srlw a0, a1, a0
 ; RV64IA-NEXT:    slli a0, a0, 48
@@ -1674,31 +1674,31 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a7, a1, .LBB21_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB21_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB21_3: # in Loop: Header=BB21_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB21_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    slli a0, a0, 16
 ; RV32IA-NEXT:    srai a0, a0, 16
 ; RV32IA-NEXT:    ret
@@ -1750,31 +1750,31 @@ define signext i16 @atomicrmw_max_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    andi a3, a0, 24
-; RV64IA-NEXT:    lui a4, 16
-; RV64IA-NEXT:    addi a4, a4, -1
-; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    lui a3, 16
 ; RV64IA-NEXT:    slli a1, a1, 48
+; RV64IA-NEXT:    li a4, 48
+; RV64IA-NEXT:    andi a5, a0, 24
+; RV64IA-NEXT:    addi a3, a3, -1
 ; RV64IA-NEXT:    srai a1, a1, 48
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    li a5, 48
-; RV64IA-NEXT:    sub a5, a5, a3
+; RV64IA-NEXT:    sub a4, a4, a5
 ; RV64IA-NEXT:  .LBB21_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NEXT:    lr.w a3, (a2)
-; RV64IA-NEXT:    and a7, a3, a4
-; RV64IA-NEXT:    mv a6, a3
-; RV64IA-NEXT:    sll a7, a7, a5
-; RV64IA-NEXT:    sra a7, a7, a5
+; RV64IA-NEXT:    lr.w a5, (a2)
+; RV64IA-NEXT:    and a7, a5, a3
+; RV64IA-NEXT:    mv a6, a5
+; RV64IA-NEXT:    sll a7, a7, a4
+; RV64IA-NEXT:    sra a7, a7, a4
 ; RV64IA-NEXT:    bge a7, a1, .LBB21_3
 ; RV64IA-NEXT:  # %bb.2: # in Loop: Header=BB21_1 Depth=1
-; RV64IA-NEXT:    xor a6, a3, a1
-; RV64IA-NEXT:    and a6, a6, a4
-; RV64IA-NEXT:    xor a6, a3, a6
+; RV64IA-NEXT:    xor a6, a5, a1
+; RV64IA-NEXT:    and a6, a6, a3
+; RV64IA-NEXT:    xor a6, a5, a6
 ; RV64IA-NEXT:  .LBB21_3: # in Loop: Header=BB21_1 Depth=1
 ; RV64IA-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-NEXT:    bnez a6, .LBB21_1
 ; RV64IA-NEXT:  # %bb.4:
-; RV64IA-NEXT:    srlw a0, a3, a0
+; RV64IA-NEXT:    srlw a0, a5, a0
 ; RV64IA-NEXT:    slli a0, a0, 48
 ; RV64IA-NEXT:    srai a0, a0, 48
 ; RV64IA-NEXT:    ret
@@ -1830,31 +1830,31 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
-; RV32IA-NEXT:    andi a3, a0, 24
-; RV32IA-NEXT:    lui a4, 16
-; RV32IA-NEXT:    addi a4, a4, -1
-; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    lui a3, 16
 ; RV32IA-NEXT:    slli a1, a1, 16
+; RV32IA-NEXT:    li a4, 16
+; RV32IA-NEXT:    andi a5, a0, 24
+; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    srai a1, a1, 16
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    sll a1, a1, a0
-; RV32IA-NEXT:    li a5, 16
-; RV32IA-NEXT:    sub a5, a5, a3
+; RV32IA-NEXT:    sub a4, a4, a5
 ; RV32IA-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    lr.w a3, (a2)
-; RV32IA-NEXT:    and a7, a3, a4
-; RV32IA-NEXT:    mv a6, a3
-; RV32IA-NEXT:    sll a7, a7, a5
-; RV32IA-NEXT:    sra a7, a7, a5
+; RV32IA-NEXT:    lr.w a5, (a2)
+; RV32IA-NEXT:    and a7, a5, a3
+; RV32IA-NEXT:    mv a6, a5
+; RV32IA-NEXT:    sll a7, a7, a4
+; RV32IA-NEXT:    sra a7, a7, a4
 ; RV32IA-NEXT:    bge a1, a7, .LBB22_3
 ; RV32IA-NEXT:  # %bb.2: # in Loop: Header=BB22_1 Depth=1
-; RV32IA-NEXT:    xor a6, a3, a1
-; RV32IA-NEXT:    and a6, a6, a4
-; RV32IA-NEXT:    xor a6, a3, a6
+; RV32IA-NEXT:    xor a6, a5, a1
+; RV32IA-NEXT:    and a6, a6, a3
+; RV32IA-NEXT:    xor a6, a5, a6
 ; RV32IA-NEXT:  .LBB22_3: # in Loop: Header=BB22_1 Depth=1
 ; RV32IA-NEXT:    sc.w a6, a6, (a2)
 ; RV32IA-NEXT:    bnez a6, .LBB22_1
 ; RV32IA-NEXT:  # %bb.4:
-; RV32IA-NEXT:    srl a0, a3, a0
+; RV32IA-NEXT:    srl a0, a5, a0
 ; RV32IA-NEXT:    slli a0, a0, 16
 ; RV32IA-NEXT:    srai a0, a0, 16
 ; RV32IA-NEXT:    ret
@@ -1906,31 +1906,31 @@ define signext i16 @atomicrmw_min_i16_monotonic(ptr %a, i16 %b) nounwind {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
-; RV64IA-NEXT:    andi a3, a0, 24
-; RV64IA-NEXT:    lui a4, 16
-; RV64IA-NEXT:    addi a4, a4, -1
-; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    lui a3, 16
 ; RV64IA-NEXT:    slli a1, a1, 48
+; RV64IA-NEXT:    li a4, 48
+; RV64IA-NEXT:    andi a5, a0, 24
+; RV64IA-NEXT:    addi a3, a3, -1
 ; RV64IA-NEXT:    srai a1, a1, 48
+; RV64IA-NEXT:    sllw a3, a3, a0
 ; RV64IA-NEXT:    sllw a1, a1, a0
-; RV64IA-NEXT:    li a5, 48
-; RV64IA-NEXT:    sub a5, a5, a3
+; RV64IA-NEXT:    sub a4, a4, a5
 ; RV64IA-NEXT:  .LBB22_1: # =>This Inner Loop Header: Depth=1
-; RV64IA-NEXT:    lr.w a3, (a2)
-; RV64IA-NEXT:    and a7, a3, a4
-; RV64IA-NEXT:    mv a6, a3
-; RV64IA-NEXT:    sll a7, a7, a5
-; RV64IA-NEXT:    sra a7, a7, a5
+; RV64IA-NEXT:    lr.w a5, (a2)
+; RV64IA-NEXT:    and a7, a5, a3
+; RV64IA-NEXT:    mv a6, a5
+; RV64IA-NEXT:    sll a7, a7, a4
+; RV64IA-NEXT:    sra a7, a7, a4
 ; RV64IA-NEXT:    bge a1, a7, .LBB22_3
 ; RV64IA-NEXT:  # %bb.2: # in Loop: Header=BB22_1 Depth=1
-; RV64IA-NEXT:    xor a6, a3, a1
-; RV64IA-NEXT:    and a6, a6, a4
-; RV64IA-NEXT:    xor a6, a3, a6
+; RV64IA-NEXT:    xor a6, a5, a1
+; RV64IA-NEXT:    and a6, a6, a3
+; RV64IA-NEXT:    xor a6, a5, a6
 ; RV64IA-NEXT:  .LBB22_3: # in Loop: Header=BB22_1 Depth=1
 ; RV64IA-NEXT:    sc.w a6, a6, (a2)
 ; RV64IA-NEXT:    bnez a6, .LBB22_1
 ; RV64IA-NEXT:  # %bb.4:
-; RV64IA-NEXT:    srlw a0, a3, a0
+; RV64IA-NEXT:    srlw a0, a5, a0
 ; RV64IA-NEXT:    slli a0, a0, 48
 ; RV64IA-NEXT:    srai a0, a0, 48
 ; RV64IA-NEXT:    ret
@@ -3182,46 +3182,46 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB43_2
 ; RV32I-NEXT:  .LBB43_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB43_7
 ; RV32I-NEXT:  .LBB43_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB43_4
+; RV32I-NEXT:    beq a5, s0, .LBB43_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB43_5
 ; RV32I-NEXT:  .LBB43_4: # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB43_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB43_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB43_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB43_1
 ; RV32I-NEXT:  .LBB43_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3236,46 +3236,46 @@ define signext i64 @atomicrmw_max_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB43_2
 ; RV32IA-NEXT:  .LBB43_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB43_7
 ; RV32IA-NEXT:  .LBB43_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB43_4
+; RV32IA-NEXT:    beq a5, s0, .LBB43_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB43_5
 ; RV32IA-NEXT:  .LBB43_4: # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB43_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB43_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB43_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB43_1
 ; RV32IA-NEXT:  .LBB43_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3335,46 +3335,46 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB44_2
 ; RV32I-NEXT:  .LBB44_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB44_7
 ; RV32I-NEXT:  .LBB44_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB44_4
+; RV32I-NEXT:    beq a5, s0, .LBB44_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    slt a0, s1, a4
+; RV32I-NEXT:    slt a0, s0, a5
 ; RV32I-NEXT:    j .LBB44_5
 ; RV32I-NEXT:  .LBB44_4: # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB44_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB44_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB44_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB44_1
 ; RV32I-NEXT:  .LBB44_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3389,46 +3389,46 @@ define signext i64 @atomicrmw_min_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB44_2
 ; RV32IA-NEXT:  .LBB44_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB44_7
 ; RV32IA-NEXT:  .LBB44_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB44_4
+; RV32IA-NEXT:    beq a5, s0, .LBB44_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    slt a0, s1, a4
+; RV32IA-NEXT:    slt a0, s0, a5
 ; RV32IA-NEXT:    j .LBB44_5
 ; RV32IA-NEXT:  .LBB44_4: # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB44_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB44_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB44_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB44_1
 ; RV32IA-NEXT:  .LBB44_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3488,46 +3488,46 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB45_2
 ; RV32I-NEXT:  .LBB45_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB45_7
 ; RV32I-NEXT:  .LBB45_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB45_4
+; RV32I-NEXT:    beq a5, s0, .LBB45_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB45_5
 ; RV32I-NEXT:  .LBB45_4: # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB45_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    bnez a0, .LBB45_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB45_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB45_1
 ; RV32I-NEXT:  .LBB45_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3542,46 +3542,46 @@ define signext i64 @atomicrmw_umax_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB45_2
 ; RV32IA-NEXT:  .LBB45_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB45_7
 ; RV32IA-NEXT:  .LBB45_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB45_4
+; RV32IA-NEXT:    beq a5, s0, .LBB45_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB45_5
 ; RV32IA-NEXT:  .LBB45_4: # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB45_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    bnez a0, .LBB45_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB45_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB45_1
 ; RV32IA-NEXT:  .LBB45_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3641,46 +3641,46 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB46_2
 ; RV32I-NEXT:  .LBB46_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    li a4, 0
 ; RV32I-NEXT:    li a5, 0
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB46_7
 ; RV32I-NEXT:  .LBB46_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB46_4
+; RV32I-NEXT:    beq a5, s0, .LBB46_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a5
 ; RV32I-NEXT:    j .LBB46_5
 ; RV32I-NEXT:  .LBB46_4: # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    sltu a0, s2, a5
+; RV32I-NEXT:    sltu a0, s2, a4
 ; RV32I-NEXT:  .LBB46_5: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32I-NEXT:    mv a2, a5
-; RV32I-NEXT:    mv a3, a4
+; RV32I-NEXT:    mv a2, a4
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:    beqz a0, .LBB46_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB46_2 Depth=1
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    j .LBB46_1
 ; RV32I-NEXT:  .LBB46_7: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3695,46 +3695,46 @@ define signext i64 @atomicrmw_umin_i64_monotonic(ptr %a, i64 %b) nounwind {
 ; RV32IA-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32IA-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB46_2
 ; RV32IA-NEXT:  .LBB46_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    li a4, 0
 ; RV32IA-NEXT:    li a5, 0
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB46_7
 ; RV32IA-NEXT:  .LBB46_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB46_4
+; RV32IA-NEXT:    beq a5, s0, .LBB46_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a5
 ; RV32IA-NEXT:    j .LBB46_5
 ; RV32IA-NEXT:  .LBB46_4: # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s2, a5
+; RV32IA-NEXT:    sltu a0, s2, a4
 ; RV32IA-NEXT:  .LBB46_5: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
-; RV32IA-NEXT:    mv a2, a5
-; RV32IA-NEXT:    mv a3, a4
+; RV32IA-NEXT:    mv a2, a4
+; RV32IA-NEXT:    mv a3, a5
 ; RV32IA-NEXT:    beqz a0, .LBB46_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB46_2 Depth=1
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    j .LBB46_1
 ; RV32IA-NEXT:  .LBB46_7: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -3806,10 +3806,10 @@ define signext i8 @cmpxchg_i8_monotonic_monotonic_val0(ptr %ptr, i8 signext %cmp
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a2, a2, a0
 ; RV32IA-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a5, (a3)
@@ -3846,10 +3846,10 @@ define signext i8 @cmpxchg_i8_monotonic_monotonic_val0(ptr %ptr, i8 signext %cmp
 ; RV64IA-NEXT:    andi a3, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a4, 255
-; RV64IA-NEXT:    sllw a4, a4, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
-; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    andi a2, a2, 255
+; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    sllw a2, a2, a0
 ; RV64IA-NEXT:  .LBB47_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a5, (a3)
@@ -3890,10 +3890,10 @@ define i1 @cmpxchg_i8_monotonic_monotonic_val1(ptr %ptr, i8 signext %cmp, i8 sig
 ; RV32IA-NEXT:    andi a3, a0, -4
 ; RV32IA-NEXT:    slli a0, a0, 3
 ; RV32IA-NEXT:    li a4, 255
-; RV32IA-NEXT:    sll a4, a4, a0
 ; RV32IA-NEXT:    andi a1, a1, 255
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    andi a2, a2, 255
+; RV32IA-NEXT:    sll a4, a4, a0
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a2, (a3)
@@ -3929,10 +3929,10 @@ define i1 @cmpxchg_i8_monotonic_monotonic_val1(ptr %ptr, i8 signext %cmp, i8 sig
 ; RV64IA-NEXT:    andi a3, a0, -4
 ; RV64IA-NEXT:    slli a0, a0, 3
 ; RV64IA-NEXT:    li a4, 255
-; RV64IA-NEXT:    sllw a4, a4, a0
 ; RV64IA-NEXT:    andi a1, a1, 255
-; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    andi a2, a2, 255
+; RV64IA-NEXT:    sllw a4, a4, a0
+; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    sllw a0, a2, a0
 ; RV64IA-NEXT:  .LBB48_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a2, (a3)
@@ -3977,8 +3977,8 @@ define signext i16 @cmpxchg_i16_monotonic_monotonic_val0(ptr %ptr, i16 signext %
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a2, a2, a0
 ; RV32IA-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a4, (a3)
@@ -4018,8 +4018,8 @@ define signext i16 @cmpxchg_i16_monotonic_monotonic_val0(ptr %ptr, i16 signext %
 ; RV64IA-NEXT:    addi a4, a4, -1
 ; RV64IA-NEXT:    sllw a5, a4, a0
 ; RV64IA-NEXT:    and a1, a1, a4
-; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    and a2, a2, a4
+; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    sllw a2, a2, a0
 ; RV64IA-NEXT:  .LBB49_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a4, (a3)
@@ -4063,8 +4063,8 @@ define i1 @cmpxchg_i16_monotonic_monotonic_val1(ptr %ptr, i16 signext %cmp, i16
 ; RV32IA-NEXT:    addi a4, a4, -1
 ; RV32IA-NEXT:    sll a5, a4, a0
 ; RV32IA-NEXT:    and a1, a1, a4
-; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    and a2, a2, a4
+; RV32IA-NEXT:    sll a1, a1, a0
 ; RV32IA-NEXT:    sll a0, a2, a0
 ; RV32IA-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV32IA-NEXT:    lr.w a2, (a3)
@@ -4103,8 +4103,8 @@ define i1 @cmpxchg_i16_monotonic_monotonic_val1(ptr %ptr, i16 signext %cmp, i16
 ; RV64IA-NEXT:    addi a4, a4, -1
 ; RV64IA-NEXT:    sllw a5, a4, a0
 ; RV64IA-NEXT:    and a1, a1, a4
-; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    and a2, a2, a4
+; RV64IA-NEXT:    sllw a1, a1, a0
 ; RV64IA-NEXT:    sllw a0, a2, a0
 ; RV64IA-NEXT:  .LBB50_1: # =>This Inner Loop Header: Depth=1
 ; RV64IA-NEXT:    lr.w a2, (a3)

diff  --git a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
index f22115130117a8..34b29ea1dc6c2b 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-cond-sub-clamp.ll
@@ -63,8 +63,8 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a3, a0, 3
-; RV32IA-NEXT:    andi a0, a3, 24
 ; RV32IA-NEXT:    li a4, 255
+; RV32IA-NEXT:    andi a0, a3, 24
 ; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    sll a3, a4, a3
 ; RV32IA-NEXT:    not a3, a3
@@ -146,8 +146,8 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a4, a0, 3
-; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    li a5, 255
+; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    lw a3, 0(a2)
 ; RV64IA-NEXT:    sllw a4, a5, a4
 ; RV64IA-NEXT:    not a4, a4
@@ -239,8 +239,8 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
-; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    lw a6, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
@@ -329,8 +329,8 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a5, a0, 3
-; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    lui a3, 16
+; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
 ; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    sllw a5, a3, a5
@@ -518,44 +518,44 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB3_3
 ; RV32I-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a4, s1
+; RV32I-NEXT:    sltu a0, a5, s0
 ; RV32I-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
 ; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    and a1, a0, s2
-; RV32I-NEXT:    sltu a2, a5, a1
-; RV32I-NEXT:    and a0, a0, s1
-; RV32I-NEXT:    sub a3, a4, a0
-; RV32I-NEXT:    sub a3, a3, a2
-; RV32I-NEXT:    sub a2, a5, a1
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    and a0, a0, s0
+; RV32I-NEXT:    sltu a3, a4, a1
+; RV32I-NEXT:    sub a0, a5, a0
+; RV32I-NEXT:    sub a2, a4, a1
+; RV32I-NEXT:    sub a3, a0, a3
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB3_5
 ; RV32I-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    bne a4, s1, .LBB3_1
+; RV32I-NEXT:    bne a5, s0, .LBB3_1
 ; RV32I-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a5, s2
+; RV32I-NEXT:    sltu a0, a4, s2
 ; RV32I-NEXT:    j .LBB3_2
 ; RV32I-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -580,44 +580,44 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s0, -8
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB3_3
 ; RV32IA-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a4, s1
+; RV32IA-NEXT:    sltu a0, a5, s0
 ; RV32IA-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
 ; RV32IA-NEXT:    xori a0, a0, 1
 ; RV32IA-NEXT:    neg a0, a0
 ; RV32IA-NEXT:    and a1, a0, s2
-; RV32IA-NEXT:    sltu a2, a5, a1
-; RV32IA-NEXT:    and a0, a0, s1
-; RV32IA-NEXT:    sub a3, a4, a0
-; RV32IA-NEXT:    sub a3, a3, a2
-; RV32IA-NEXT:    sub a2, a5, a1
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    and a0, a0, s0
+; RV32IA-NEXT:    sltu a3, a4, a1
+; RV32IA-NEXT:    sub a0, a5, a0
+; RV32IA-NEXT:    sub a2, a4, a1
+; RV32IA-NEXT:    sub a3, a0, a3
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB3_5
 ; RV32IA-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    bne a4, s1, .LBB3_1
+; RV32IA-NEXT:    bne a5, s0, .LBB3_1
 ; RV32IA-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a5, s2
+; RV32IA-NEXT:    sltu a0, a4, s2
 ; RV32IA-NEXT:    j .LBB3_2
 ; RV32IA-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -741,11 +741,11 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV32IA-LABEL: atomicrmw_usub_sat_i8:
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
-; RV32IA-NEXT:    slli a3, a0, 3
-; RV32IA-NEXT:    andi a0, a3, 24
-; RV32IA-NEXT:    li a5, 255
+; RV32IA-NEXT:    slli a0, a0, 3
+; RV32IA-NEXT:    li a3, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    lw a4, 0(a2)
-; RV32IA-NEXT:    sll a3, a5, a3
+; RV32IA-NEXT:    andi a0, a0, 24
 ; RV32IA-NEXT:    not a3, a3
 ; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:  .LBB4_1: # %atomicrmw.start
@@ -818,11 +818,11 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; RV64IA-LABEL: atomicrmw_usub_sat_i8:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a4, a0, 3
-; RV64IA-NEXT:    andi a0, a4, 24
-; RV64IA-NEXT:    li a5, 255
+; RV64IA-NEXT:    slli a0, a0, 3
+; RV64IA-NEXT:    li a3, 255
+; RV64IA-NEXT:    sllw a4, a3, a0
 ; RV64IA-NEXT:    lw a3, 0(a2)
-; RV64IA-NEXT:    sllw a4, a5, a4
+; RV64IA-NEXT:    andi a0, a0, 24
 ; RV64IA-NEXT:    not a4, a4
 ; RV64IA-NEXT:    andi a1, a1, 255
 ; RV64IA-NEXT:  .LBB4_1: # %atomicrmw.start
@@ -906,8 +906,8 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
-; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
@@ -990,8 +990,8 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a5, a0, 3
-; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    lui a3, 16
+; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
 ; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    sllw a5, a3, a5
@@ -1172,43 +1172,43 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB7_3
 ; RV32I-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_3 Depth=1
-; RV32I-NEXT:    sltu a2, a4, a0
+; RV32I-NEXT:    sltu a2, a5, a0
 ; RV32I-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_3 Depth=1
 ; RV32I-NEXT:    addi a3, a2, -1
 ; RV32I-NEXT:    and a2, a3, a1
 ; RV32I-NEXT:    and a3, a3, a0
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB7_5
 ; RV32I-NEXT:  .LBB7_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    sltu a0, a5, s2
-; RV32I-NEXT:    sub a1, a4, s1
+; RV32I-NEXT:    sltu a0, a4, s2
+; RV32I-NEXT:    sub a1, a5, s0
 ; RV32I-NEXT:    sub a0, a1, a0
-; RV32I-NEXT:    sub a1, a5, s2
-; RV32I-NEXT:    bne a0, a4, .LBB7_1
+; RV32I-NEXT:    sub a1, a4, s2
+; RV32I-NEXT:    bne a0, a5, .LBB7_1
 ; RV32I-NEXT:  # %bb.4: # in Loop: Header=BB7_3 Depth=1
-; RV32I-NEXT:    sltu a2, a5, a1
+; RV32I-NEXT:    sltu a2, a4, a1
 ; RV32I-NEXT:    j .LBB7_2
 ; RV32I-NEXT:  .LBB7_5: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1233,43 +1233,43 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s0, -8
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB7_3
 ; RV32IA-NEXT:  .LBB7_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_3 Depth=1
-; RV32IA-NEXT:    sltu a2, a4, a0
+; RV32IA-NEXT:    sltu a2, a5, a0
 ; RV32IA-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_3 Depth=1
 ; RV32IA-NEXT:    addi a3, a2, -1
 ; RV32IA-NEXT:    and a2, a3, a1
 ; RV32IA-NEXT:    and a3, a3, a0
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB7_5
 ; RV32IA-NEXT:  .LBB7_3: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    sltu a0, a5, s2
-; RV32IA-NEXT:    sub a1, a4, s1
+; RV32IA-NEXT:    sltu a0, a4, s2
+; RV32IA-NEXT:    sub a1, a5, s0
 ; RV32IA-NEXT:    sub a0, a1, a0
-; RV32IA-NEXT:    sub a1, a5, s2
-; RV32IA-NEXT:    bne a0, a4, .LBB7_1
+; RV32IA-NEXT:    sub a1, a4, s2
+; RV32IA-NEXT:    bne a0, a5, .LBB7_1
 ; RV32IA-NEXT:  # %bb.4: # in Loop: Header=BB7_3 Depth=1
-; RV32IA-NEXT:    sltu a2, a5, a1
+; RV32IA-NEXT:    sltu a2, a4, a1
 ; RV32IA-NEXT:    j .LBB7_2
 ; RV32IA-NEXT:  .LBB7_5: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
index 14a1e6b9827174..3ff01e4987bd5c 100644
--- a/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/RISCV/atomicrmw-uinc-udec-wrap.ll
@@ -57,11 +57,11 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; RV32IA-LABEL: atomicrmw_uinc_wrap_i8:
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
-; RV32IA-NEXT:    slli a3, a0, 3
-; RV32IA-NEXT:    andi a0, a3, 24
-; RV32IA-NEXT:    li a5, 255
+; RV32IA-NEXT:    slli a0, a0, 3
+; RV32IA-NEXT:    li a3, 255
+; RV32IA-NEXT:    sll a3, a3, a0
 ; RV32IA-NEXT:    lw a4, 0(a2)
-; RV32IA-NEXT:    sll a3, a5, a3
+; RV32IA-NEXT:    andi a0, a0, 24
 ; RV32IA-NEXT:    not a3, a3
 ; RV32IA-NEXT:    andi a1, a1, 255
 ; RV32IA-NEXT:  .LBB0_1: # %atomicrmw.start
@@ -135,11 +135,11 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA-LABEL: atomicrmw_uinc_wrap_i8:
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
-; RV64IA-NEXT:    slli a4, a0, 3
-; RV64IA-NEXT:    andi a0, a4, 24
-; RV64IA-NEXT:    li a5, 255
+; RV64IA-NEXT:    slli a0, a0, 3
+; RV64IA-NEXT:    li a3, 255
+; RV64IA-NEXT:    sllw a4, a3, a0
 ; RV64IA-NEXT:    lw a3, 0(a2)
-; RV64IA-NEXT:    sllw a4, a5, a4
+; RV64IA-NEXT:    andi a0, a0, 24
 ; RV64IA-NEXT:    not a4, a4
 ; RV64IA-NEXT:    andi a1, a1, 255
 ; RV64IA-NEXT:  .LBB0_1: # %atomicrmw.start
@@ -224,8 +224,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
-; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    lw a5, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
@@ -239,8 +239,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32IA-NEXT:    and a7, a5, a3
 ; RV32IA-NEXT:    addi a5, a5, 1
 ; RV32IA-NEXT:    sltu a7, a7, a1
-; RV32IA-NEXT:    neg a7, a7
 ; RV32IA-NEXT:    and a5, a5, a3
+; RV32IA-NEXT:    neg a7, a7
 ; RV32IA-NEXT:    and a5, a7, a5
 ; RV32IA-NEXT:    sll a5, a5, a0
 ; RV32IA-NEXT:    and a7, a6, a4
@@ -309,8 +309,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a5, a0, 3
-; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    lui a3, 16
+; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
 ; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    sllw a5, a3, a5
@@ -324,8 +324,8 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA-NEXT:    and t0, a6, a3
 ; RV64IA-NEXT:    addi a6, a6, 1
 ; RV64IA-NEXT:    sltu t0, t0, a1
-; RV64IA-NEXT:    negw t0, t0
 ; RV64IA-NEXT:    and a6, a6, a3
+; RV64IA-NEXT:    negw t0, t0
 ; RV64IA-NEXT:    and a6, t0, a6
 ; RV64IA-NEXT:    sllw a6, a6, a0
 ; RV64IA-NEXT:    and a4, a4, a5
@@ -493,42 +493,42 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    lw a5, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB3_3
 ; RV32I-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a4, s1
+; RV32I-NEXT:    sltu a0, a5, s0
 ; RV32I-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    addi a1, a5, 1
-; RV32I-NEXT:    seqz a2, a1
-; RV32I-NEXT:    add a3, a4, a2
+; RV32I-NEXT:    addi a1, a4, 1
 ; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    seqz a3, a1
 ; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    add a3, a5, a3
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
-; RV32I-NEXT:    lw a5, 8(sp)
-; RV32I-NEXT:    lw a4, 12(sp)
+; RV32I-NEXT:    lw a4, 8(sp)
+; RV32I-NEXT:    lw a5, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB3_5
 ; RV32I-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    bne a4, s1, .LBB3_1
+; RV32I-NEXT:    bne a5, s0, .LBB3_1
 ; RV32I-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32I-NEXT:    sltu a0, a5, s2
+; RV32I-NEXT:    sltu a0, a4, s2
 ; RV32I-NEXT:    j .LBB3_2
 ; RV32I-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32I-NEXT:    mv a0, a5
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a0, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -553,42 +553,42 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s0, -8
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
-; RV32IA-NEXT:    mv s0, a0
-; RV32IA-NEXT:    lw a5, 0(a0)
-; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
+; RV32IA-NEXT:    lw a4, 0(a0)
+; RV32IA-NEXT:    lw a5, 4(a0)
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB3_3
 ; RV32IA-NEXT:  .LBB3_1: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a4, s1
+; RV32IA-NEXT:    sltu a0, a5, s0
 ; RV32IA-NEXT:  .LBB3_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    addi a1, a5, 1
-; RV32IA-NEXT:    seqz a2, a1
-; RV32IA-NEXT:    add a3, a4, a2
+; RV32IA-NEXT:    addi a1, a4, 1
 ; RV32IA-NEXT:    neg a0, a0
+; RV32IA-NEXT:    seqz a3, a1
 ; RV32IA-NEXT:    and a2, a0, a1
+; RV32IA-NEXT:    add a3, a5, a3
 ; RV32IA-NEXT:    and a3, a0, a3
-; RV32IA-NEXT:    sw a5, 8(sp)
-; RV32IA-NEXT:    sw a4, 12(sp)
+; RV32IA-NEXT:    sw a4, 8(sp)
+; RV32IA-NEXT:    sw a5, 12(sp)
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
-; RV32IA-NEXT:    lw a5, 8(sp)
-; RV32IA-NEXT:    lw a4, 12(sp)
+; RV32IA-NEXT:    lw a4, 8(sp)
+; RV32IA-NEXT:    lw a5, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB3_5
 ; RV32IA-NEXT:  .LBB3_3: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    bne a4, s1, .LBB3_1
+; RV32IA-NEXT:    bne a5, s0, .LBB3_1
 ; RV32IA-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
-; RV32IA-NEXT:    sltu a0, a5, s2
+; RV32IA-NEXT:    sltu a0, a4, s2
 ; RV32IA-NEXT:    j .LBB3_2
 ; RV32IA-NEXT:  .LBB3_5: # %atomicrmw.end
-; RV32IA-NEXT:    mv a0, a5
-; RV32IA-NEXT:    mv a1, a4
+; RV32IA-NEXT:    mv a0, a4
+; RV32IA-NEXT:    mv a1, a5
 ; RV32IA-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32IA-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -726,8 +726,8 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a3, a0, 3
-; RV32IA-NEXT:    andi a0, a3, 24
 ; RV32IA-NEXT:    li a4, 255
+; RV32IA-NEXT:    andi a0, a3, 24
 ; RV32IA-NEXT:    lw a6, 0(a2)
 ; RV32IA-NEXT:    sll a3, a4, a3
 ; RV32IA-NEXT:    not a3, a3
@@ -827,8 +827,8 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a4, a0, 3
-; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    li a5, 255
+; RV64IA-NEXT:    andi a0, a4, 24
 ; RV64IA-NEXT:    lw a3, 0(a2)
 ; RV64IA-NEXT:    sllw a4, a5, a4
 ; RV64IA-NEXT:    not a4, a4
@@ -938,8 +938,8 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV32IA:       # %bb.0:
 ; RV32IA-NEXT:    andi a2, a0, -4
 ; RV32IA-NEXT:    slli a4, a0, 3
-; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    lui a3, 16
+; RV32IA-NEXT:    andi a0, a4, 24
 ; RV32IA-NEXT:    addi a3, a3, -1
 ; RV32IA-NEXT:    lw a7, 0(a2)
 ; RV32IA-NEXT:    sll a4, a3, a4
@@ -1046,8 +1046,8 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; RV64IA:       # %bb.0:
 ; RV64IA-NEXT:    andi a2, a0, -4
 ; RV64IA-NEXT:    slli a5, a0, 3
-; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    lui a3, 16
+; RV64IA-NEXT:    andi a0, a5, 24
 ; RV64IA-NEXT:    addiw a3, a3, -1
 ; RV64IA-NEXT:    lw a4, 0(a2)
 ; RV64IA-NEXT:    sllw a5, a3, a5
@@ -1281,10 +1281,10 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    .cfi_offset s0, -8
 ; RV32I-NEXT:    .cfi_offset s1, -12
 ; RV32I-NEXT:    .cfi_offset s2, -16
-; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s0, a2
+; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    lw a5, 0(a0)
 ; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    mv s1, a2
 ; RV32I-NEXT:    mv s2, a1
 ; RV32I-NEXT:    j .LBB7_2
 ; RV32I-NEXT:  .LBB7_1: # %atomicrmw.start
@@ -1294,17 +1294,17 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 5
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __atomic_compare_exchange_8
 ; RV32I-NEXT:    lw a5, 8(sp)
 ; RV32I-NEXT:    lw a4, 12(sp)
 ; RV32I-NEXT:    bnez a0, .LBB7_7
 ; RV32I-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32I-NEXT:    beq a4, s1, .LBB7_4
+; RV32I-NEXT:    beq a4, s0, .LBB7_4
 ; RV32I-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32I-NEXT:    sltu a0, s1, a4
+; RV32I-NEXT:    sltu a0, s0, a4
 ; RV32I-NEXT:    j .LBB7_5
 ; RV32I-NEXT:  .LBB7_4: # in Loop: Header=BB7_2 Depth=1
 ; RV32I-NEXT:    sltu a0, s2, a5
@@ -1314,7 +1314,7 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    mv a2, s2
-; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    bnez a0, .LBB7_1
 ; RV32I-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32I-NEXT:    # in Loop: Header=BB7_2 Depth=1
@@ -1349,10 +1349,10 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    .cfi_offset s0, -8
 ; RV32IA-NEXT:    .cfi_offset s1, -12
 ; RV32IA-NEXT:    .cfi_offset s2, -16
-; RV32IA-NEXT:    mv s0, a0
+; RV32IA-NEXT:    mv s0, a2
+; RV32IA-NEXT:    mv s1, a0
 ; RV32IA-NEXT:    lw a5, 0(a0)
 ; RV32IA-NEXT:    lw a4, 4(a0)
-; RV32IA-NEXT:    mv s1, a2
 ; RV32IA-NEXT:    mv s2, a1
 ; RV32IA-NEXT:    j .LBB7_2
 ; RV32IA-NEXT:  .LBB7_1: # %atomicrmw.start
@@ -1362,17 +1362,17 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    addi a1, sp, 8
 ; RV32IA-NEXT:    li a4, 5
 ; RV32IA-NEXT:    li a5, 5
-; RV32IA-NEXT:    mv a0, s0
+; RV32IA-NEXT:    mv a0, s1
 ; RV32IA-NEXT:    call __atomic_compare_exchange_8
 ; RV32IA-NEXT:    lw a5, 8(sp)
 ; RV32IA-NEXT:    lw a4, 12(sp)
 ; RV32IA-NEXT:    bnez a0, .LBB7_7
 ; RV32IA-NEXT:  .LBB7_2: # %atomicrmw.start
 ; RV32IA-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32IA-NEXT:    beq a4, s1, .LBB7_4
+; RV32IA-NEXT:    beq a4, s0, .LBB7_4
 ; RV32IA-NEXT:  # %bb.3: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1
-; RV32IA-NEXT:    sltu a0, s1, a4
+; RV32IA-NEXT:    sltu a0, s0, a4
 ; RV32IA-NEXT:    j .LBB7_5
 ; RV32IA-NEXT:  .LBB7_4: # in Loop: Header=BB7_2 Depth=1
 ; RV32IA-NEXT:    sltu a0, s2, a5
@@ -1382,7 +1382,7 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; RV32IA-NEXT:    seqz a1, a1
 ; RV32IA-NEXT:    or a0, a1, a0
 ; RV32IA-NEXT:    mv a2, s2
-; RV32IA-NEXT:    mv a3, s1
+; RV32IA-NEXT:    mv a3, s0
 ; RV32IA-NEXT:    bnez a0, .LBB7_1
 ; RV32IA-NEXT:  # %bb.6: # %atomicrmw.start
 ; RV32IA-NEXT:    # in Loop: Header=BB7_2 Depth=1

diff  --git a/llvm/test/CodeGen/RISCV/avgceils.ll b/llvm/test/CodeGen/RISCV/avgceils.ll
index 2ff4ad3b3b4624..64410fad6029aa 100644
--- a/llvm/test/CodeGen/RISCV/avgceils.ll
+++ b/llvm/test/CodeGen/RISCV/avgceils.ll
@@ -12,8 +12,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    addi a0, a0, 1
@@ -23,8 +23,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -41,8 +41,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    addi a0, a0, 1
@@ -52,8 +52,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -72,8 +72,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    addi a0, a0, 1
@@ -83,8 +83,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -101,8 +101,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    addi a0, a0, 1
@@ -112,8 +112,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -183,13 +183,13 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a4, a1, a3
 ; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srai a3, a1, 1
-; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a3, a0, a2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srai a2, a1, 1
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    or a3, a3, a1
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    sltu a1, a0, a3
 ; RV32I-NEXT:    sub a1, a4, a1
 ; RV32I-NEXT:    sub a0, a0, a3
@@ -214,13 +214,13 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a4, a1, a3
 ; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srai a3, a1, 1
-; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a3, a0, a2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srai a2, a1, 1
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    or a3, a3, a1
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    sltu a1, a0, a3
 ; RV32I-NEXT:    sub a1, a4, a1
 ; RV32I-NEXT:    sub a0, a0, a3

diff  --git a/llvm/test/CodeGen/RISCV/avgceilu.ll b/llvm/test/CodeGen/RISCV/avgceilu.ll
index cc12b585036abb..924a50a836ddaf 100644
--- a/llvm/test/CodeGen/RISCV/avgceilu.ll
+++ b/llvm/test/CodeGen/RISCV/avgceilu.ll
@@ -132,8 +132,8 @@ define i32 @test_fixed_i32(i32 %a0, i32 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -158,8 +158,8 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    addi a0, a0, 1
@@ -179,13 +179,13 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a4, a1, a3
 ; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srli a3, a1, 1
-; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a3, a0, a2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a1, 1
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    or a3, a3, a1
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    sltu a1, a0, a3
 ; RV32I-NEXT:    sub a1, a4, a1
 ; RV32I-NEXT:    sub a0, a0, a3
@@ -210,13 +210,13 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    or a4, a1, a3
 ; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    srli a3, a1, 1
-; RV32I-NEXT:    sub a4, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a3, a0, a2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a1, 1
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a3, a3, 1
+; RV32I-NEXT:    sub a4, a4, a2
 ; RV32I-NEXT:    or a3, a3, a1
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    sltu a1, a0, a3
 ; RV32I-NEXT:    sub a1, a4, a1
 ; RV32I-NEXT:    sub a0, a0, a3

diff  --git a/llvm/test/CodeGen/RISCV/avgfloors.ll b/llvm/test/CodeGen/RISCV/avgfloors.ll
index b36177de021d1b..b321f4c2f29395 100644
--- a/llvm/test/CodeGen/RISCV/avgfloors.ll
+++ b/llvm/test/CodeGen/RISCV/avgfloors.ll
@@ -12,8 +12,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srai a0, a0, 1
@@ -22,8 +22,8 @@ define i8 @test_fixed_i8(i8 %a0, i8 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srai a0, a0, 1
@@ -39,8 +39,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srai a0, a0, 1
@@ -49,8 +49,8 @@ define i8 @test_ext_i8(i8 %a0, i8 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 56
-; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srai a0, a0, 1
@@ -67,8 +67,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srai a0, a0, 1
@@ -77,8 +77,8 @@ define i16 @test_fixed_i16(i16 %a0, i16 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srai a0, a0, 1
@@ -94,8 +94,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    srai a0, a0, 1
@@ -104,8 +104,8 @@ define i16 @test_ext_i16(i16 %a0, i16 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srai a0, a0, 1
@@ -172,8 +172,8 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    srai a3, a1, 1
 ; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a4, a0, a2
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a4, a4, 1
 ; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    and a2, a0, a2
@@ -203,8 +203,8 @@ define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    srai a3, a1, 1
 ; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    xor a4, a0, a2
+; RV32I-NEXT:    slli a1, a1, 31
 ; RV32I-NEXT:    srli a4, a4, 1
 ; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    and a2, a0, a2

diff  --git a/llvm/test/CodeGen/RISCV/avgflooru.ll b/llvm/test/CodeGen/RISCV/avgflooru.ll
index fa88c3760e455d..550cc3136bbc33 100644
--- a/llvm/test/CodeGen/RISCV/avgflooru.ll
+++ b/llvm/test/CodeGen/RISCV/avgflooru.ll
@@ -122,8 +122,8 @@ define i32 @test_fixed_i32(i32 %a0, i32 %a1) nounwind {
 ; RV64I-LABEL: test_fixed_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srli a0, a0, 1
@@ -147,8 +147,8 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 ; RV64I-LABEL: test_ext_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    srli a0, a0, 1
@@ -164,20 +164,20 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind {
 define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-LABEL: test_fixed_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    add a4, a3, a1
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    sltu a1, a0, a2
-; RV32I-NEXT:    add a2, a4, a1
-; RV32I-NEXT:    beq a2, a3, .LBB6_2
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    beq a1, a3, .LBB6_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:    sltu a2, a1, a3
 ; RV32I-NEXT:  .LBB6_2:
-; RV32I-NEXT:    slli a1, a1, 31
-; RV32I-NEXT:    srli a3, a2, 1
-; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    slli a2, a2, 31
+; RV32I-NEXT:    srli a3, a1, 1
+; RV32I-NEXT:    slli a4, a1, 31
 ; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    or a1, a3, a2
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_fixed_i64:
@@ -197,20 +197,20 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind {
 define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind {
 ; RV32I-LABEL: test_ext_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    add a4, a3, a1
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    sltu a1, a0, a2
-; RV32I-NEXT:    add a2, a4, a1
-; RV32I-NEXT:    beq a2, a3, .LBB7_2
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    beq a1, a3, .LBB7_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu a1, a2, a3
+; RV32I-NEXT:    sltu a2, a1, a3
 ; RV32I-NEXT:  .LBB7_2:
-; RV32I-NEXT:    slli a1, a1, 31
-; RV32I-NEXT:    srli a3, a2, 1
-; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    slli a2, a2, 31
+; RV32I-NEXT:    srli a3, a1, 1
+; RV32I-NEXT:    slli a4, a1, 31
 ; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    or a1, a3, a2
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_ext_i64:

diff  --git a/llvm/test/CodeGen/RISCV/bf16-promote.ll b/llvm/test/CodeGen/RISCV/bf16-promote.ll
index c17450a80de963..08c053fab4f676 100644
--- a/llvm/test/CodeGen/RISCV/bf16-promote.ll
+++ b/llvm/test/CodeGen/RISCV/bf16-promote.ll
@@ -110,13 +110,13 @@ define void @test_fadd(ptr %p, ptr %q) nounwind {
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
-; RV64-NEXT:    lhu a1, 0(a1)
 ; RV64-NEXT:    mv s0, a0
-; RV64-NEXT:    lhu a0, 0(a0)
-; RV64-NEXT:    slli a1, a1, 16
-; RV64-NEXT:    fmv.w.x fa5, a1
+; RV64-NEXT:    lhu a0, 0(a1)
+; RV64-NEXT:    lhu a1, 0(s0)
 ; RV64-NEXT:    slli a0, a0, 16
-; RV64-NEXT:    fmv.w.x fa4, a0
+; RV64-NEXT:    slli a1, a1, 16
+; RV64-NEXT:    fmv.w.x fa5, a0
+; RV64-NEXT:    fmv.w.x fa4, a1
 ; RV64-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64-NEXT:    call __truncsfbf2
 ; RV64-NEXT:    fmv.x.w a0, fa0
@@ -131,13 +131,13 @@ define void @test_fadd(ptr %p, ptr %q) nounwind {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    lhu a1, 0(a1)
 ; RV32-NEXT:    mv s0, a0
-; RV32-NEXT:    lhu a0, 0(a0)
-; RV32-NEXT:    slli a1, a1, 16
-; RV32-NEXT:    fmv.w.x fa5, a1
+; RV32-NEXT:    lhu a0, 0(a1)
+; RV32-NEXT:    lhu a1, 0(s0)
 ; RV32-NEXT:    slli a0, a0, 16
-; RV32-NEXT:    fmv.w.x fa4, a0
+; RV32-NEXT:    slli a1, a1, 16
+; RV32-NEXT:    fmv.w.x fa5, a0
+; RV32-NEXT:    fmv.w.x fa4, a1
 ; RV32-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32-NEXT:    call __truncsfbf2
 ; RV32-NEXT:    fmv.x.w a0, fa0

diff  --git a/llvm/test/CodeGen/RISCV/bfloat-arith.ll b/llvm/test/CodeGen/RISCV/bfloat-arith.ll
index b688af4234e65a..871b43e61df508 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-arith.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-arith.ll
@@ -102,11 +102,11 @@ define i32 @fneg_bf16(bfloat %a, bfloat %b) nounwind {
 ; CHECK-LABEL: fneg_bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
+; CHECK-NEXT:    lui a0, 1048568
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa4, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
@@ -124,12 +124,12 @@ define bfloat @fsgnjn_bf16(bfloat %a, bfloat %b) nounwind {
 ; RV32IZFBFMIN:       # %bb.0:
 ; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
 ; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
+; RV32IZFBFMIN-NEXT:    lui a0, 1048568
 ; RV32IZFBFMIN-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32IZFBFMIN-NEXT:    fcvt.bf16.s fa5, fa5
-; RV32IZFBFMIN-NEXT:    fmv.x.h a0, fa5
-; RV32IZFBFMIN-NEXT:    not a0, a0
-; RV32IZFBFMIN-NEXT:    lui a1, 1048568
-; RV32IZFBFMIN-NEXT:    and a0, a0, a1
+; RV32IZFBFMIN-NEXT:    fmv.x.h a1, fa5
+; RV32IZFBFMIN-NEXT:    not a1, a1
+; RV32IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV32IZFBFMIN-NEXT:    fmv.x.h a1, fa0
 ; RV32IZFBFMIN-NEXT:    slli a1, a1, 17
 ; RV32IZFBFMIN-NEXT:    srli a1, a1, 17
@@ -141,12 +141,12 @@ define bfloat @fsgnjn_bf16(bfloat %a, bfloat %b) nounwind {
 ; RV64IZFBFMIN:       # %bb.0:
 ; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
 ; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
+; RV64IZFBFMIN-NEXT:    lui a0, 1048568
 ; RV64IZFBFMIN-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64IZFBFMIN-NEXT:    fcvt.bf16.s fa5, fa5
-; RV64IZFBFMIN-NEXT:    fmv.x.h a0, fa5
-; RV64IZFBFMIN-NEXT:    not a0, a0
-; RV64IZFBFMIN-NEXT:    lui a1, 1048568
-; RV64IZFBFMIN-NEXT:    and a0, a0, a1
+; RV64IZFBFMIN-NEXT:    fmv.x.h a1, fa5
+; RV64IZFBFMIN-NEXT:    not a1, a1
+; RV64IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV64IZFBFMIN-NEXT:    fmv.x.h a1, fa0
 ; RV64IZFBFMIN-NEXT:    slli a1, a1, 49
 ; RV64IZFBFMIN-NEXT:    srli a1, a1, 49
@@ -247,16 +247,16 @@ define bfloat @fmsub_bf16(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa2
 ; CHECK-NEXT:    fmv.w.x fa4, zero
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa5, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa4, fa1
-; CHECK-NEXT:    fcvt.s.bf16 fa3, fa0
-; CHECK-NEXT:    fmadd.s fa5, fa3, fa4, fa5
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
+; CHECK-NEXT:    fmadd.s fa5, fa4, fa3, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa0, fa5
 ; CHECK-NEXT:    ret
   %c_ = fadd bfloat 0.0, %c ; avoid negation using xor
@@ -270,17 +270,17 @@ define bfloat @fnmadd_bf16(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK-NEXT:    fmv.w.x fa4, zero
-; CHECK-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa2
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
-; CHECK-NEXT:    fmv.h.x fa5, a0
-; CHECK-NEXT:    fmv.x.h a0, fa4
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    fmv.x.h a2, fa4
+; CHECK-NEXT:    xor a1, a1, a0
+; CHECK-NEXT:    xor a0, a2, a0
+; CHECK-NEXT:    fmv.h.x fa5, a1
 ; CHECK-NEXT:    fmv.h.x fa4, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
@@ -301,17 +301,17 @@ define bfloat @fnmadd_s_2(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
 ; CHECK-NEXT:    fmv.w.x fa4, zero
-; CHECK-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa2
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
-; CHECK-NEXT:    fmv.h.x fa5, a0
-; CHECK-NEXT:    fmv.x.h a0, fa4
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    fmv.x.h a2, fa4
+; CHECK-NEXT:    xor a1, a1, a0
+; CHECK-NEXT:    xor a0, a2, a0
+; CHECK-NEXT:    fmv.h.x fa5, a1
 ; CHECK-NEXT:    fmv.h.x fa4, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
@@ -369,16 +369,16 @@ define bfloat @fnmsub_bf16(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK-NEXT:    fmv.w.x fa4, zero
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fcvt.s.bf16 fa3, fa2
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa5, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa4, fa2
-; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
-; CHECK-NEXT:    fmadd.s fa5, fa5, fa3, fa4
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa1
+; CHECK-NEXT:    fmadd.s fa5, fa5, fa4, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa0, fa5
 ; CHECK-NEXT:    ret
   %a_ = fadd bfloat 0.0, %a
@@ -392,16 +392,16 @@ define bfloat @fnmsub_bf16_2(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
 ; CHECK-NEXT:    fmv.w.x fa4, zero
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fcvt.s.bf16 fa3, fa2
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa5, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa4, fa2
-; CHECK-NEXT:    fcvt.s.bf16 fa3, fa0
-; CHECK-NEXT:    fmadd.s fa5, fa3, fa5, fa4
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
+; CHECK-NEXT:    fmadd.s fa5, fa4, fa5, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa0, fa5
 ; CHECK-NEXT:    ret
   %b_ = fadd bfloat 0.0, %b
@@ -432,11 +432,11 @@ define bfloat @fmsub_bf16_contract(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa2
 ; CHECK-NEXT:    fmv.w.x fa4, zero
+; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
+; CHECK-NEXT:    fcvt.s.bf16 fa2, fa0
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
+; CHECK-NEXT:    fmul.s fa4, fa2, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa4, fa1
-; CHECK-NEXT:    fcvt.s.bf16 fa3, fa0
-; CHECK-NEXT:    fmul.s fa4, fa3, fa4
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
@@ -454,21 +454,21 @@ define bfloat @fnmadd_bf16_contract(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK-NEXT:    fmv.w.x fa4, zero
-; CHECK-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
-; CHECK-NEXT:    fadd.s fa3, fa3, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa3, fa3
 ; CHECK-NEXT:    fcvt.s.bf16 fa2, fa2
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    fadd.s fa5, fa5, fa4
+; CHECK-NEXT:    fadd.s fa3, fa3, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa2, fa4
+; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
+; CHECK-NEXT:    fcvt.bf16.s fa3, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa3
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
 ; CHECK-NEXT:    fmul.s fa5, fa5, fa3
 ; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
-; CHECK-NEXT:    fmv.x.h a0, fa5
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    xor a0, a0, a1
+; CHECK-NEXT:    fmv.x.h a1, fa5
+; CHECK-NEXT:    xor a0, a1, a0
 ; CHECK-NEXT:    fmv.h.x fa5, a0
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
@@ -489,10 +489,10 @@ define bfloat @fnmsub_bf16_contract(bfloat %a, bfloat %b, bfloat %c) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK-NEXT:    fmv.w.x fa4, zero
-; CHECK-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.s.bf16 fa3, fa1
+; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-NEXT:    fcvt.bf16.s fa5, fa5
 ; CHECK-NEXT:    fcvt.bf16.s fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa4
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa5

diff  --git a/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll b/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll
index 243c7d463661a9..51ea8873d8c031 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-br-fcmp.ll
@@ -292,9 +292,9 @@ define void @br_fcmp_ord(bfloat %a, bfloat %b) nounwind {
 ; RV32IZFBFMIN-LABEL: br_fcmp_ord:
 ; RV32IZFBFMIN:       # %bb.0:
 ; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
+; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
 ; RV32IZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; RV32IZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFBFMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV32IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV32IZFBFMIN-NEXT:    bnez a0, .LBB8_2
 ; RV32IZFBFMIN-NEXT:  # %bb.1: # %if.else
@@ -307,9 +307,9 @@ define void @br_fcmp_ord(bfloat %a, bfloat %b) nounwind {
 ; RV64IZFBFMIN-LABEL: br_fcmp_ord:
 ; RV64IZFBFMIN:       # %bb.0:
 ; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
+; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
 ; RV64IZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; RV64IZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFBFMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV64IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV64IZFBFMIN-NEXT:    bnez a0, .LBB8_2
 ; RV64IZFBFMIN-NEXT:  # %bb.1: # %if.else
@@ -545,9 +545,9 @@ define void @br_fcmp_uno(bfloat %a, bfloat %b) nounwind {
 ; RV32IZFBFMIN-LABEL: br_fcmp_uno:
 ; RV32IZFBFMIN:       # %bb.0:
 ; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
+; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
 ; RV32IZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; RV32IZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFBFMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV32IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV32IZFBFMIN-NEXT:    beqz a0, .LBB15_2
 ; RV32IZFBFMIN-NEXT:  # %bb.1: # %if.else
@@ -560,9 +560,9 @@ define void @br_fcmp_uno(bfloat %a, bfloat %b) nounwind {
 ; RV64IZFBFMIN-LABEL: br_fcmp_uno:
 ; RV64IZFBFMIN:       # %bb.0:
 ; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa1
+; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa4, fa0
 ; RV64IZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; RV64IZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFBFMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV64IZFBFMIN-NEXT:    and a0, a1, a0
 ; RV64IZFBFMIN-NEXT:    beqz a0, .LBB15_2
 ; RV64IZFBFMIN-NEXT:  # %bb.1: # %if.else

diff  --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
index c2c21a30d4e4c5..82359769c7c22f 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll
@@ -51,13 +51,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN-LABEL: fcvt_si_bf16_sat:
 ; CHECK32ZFBFMIN:       # %bb.0: # %start
 ; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK32ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK32ZFBFMIN-NEXT:    neg a0, a0
-; CHECK32ZFBFMIN-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK32ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; CHECK32ZFBFMIN-NEXT:    lui a1, 815104
-; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK32ZFBFMIN-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK32ZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK32ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; CHECK32ZFBFMIN-NEXT:    lui a0, 815104
+; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa3
+; CHECK32ZFBFMIN-NEXT:    neg a0, a1
 ; CHECK32ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fcvt.w.s a1, fa5, rtz
 ; CHECK32ZFBFMIN-NEXT:    and a0, a0, a1
@@ -66,14 +66,14 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-LABEL: fcvt_si_bf16_sat:
 ; RV32ID:       # %bb.0: # %start
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
-; RV32ID-NEXT:    slli a0, a0, 16
-; RV32ID-NEXT:    fmv.w.x fa5, a0
-; RV32ID-NEXT:    feq.s a0, fa5, fa5
+; RV32ID-NEXT:    lui a1, 815104
+; RV32ID-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV32ID-NEXT:    slli a0, a0, 16
 ; RV32ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV32ID-NEXT:    lui a1, 815104
-; RV32ID-NEXT:    fmv.w.x fa3, a1
-; RV32ID-NEXT:    fmax.s fa5, fa5, fa3
+; RV32ID-NEXT:    fmv.w.x fa3, a0
+; RV32ID-NEXT:    feq.s a0, fa3, fa3
+; RV32ID-NEXT:    fmax.s fa5, fa3, fa5
 ; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -83,13 +83,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; CHECK64ZFBFMIN-LABEL: fcvt_si_bf16_sat:
 ; CHECK64ZFBFMIN:       # %bb.0: # %start
 ; CHECK64ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK64ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK64ZFBFMIN-NEXT:    neg a0, a0
-; CHECK64ZFBFMIN-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK64ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; CHECK64ZFBFMIN-NEXT:    lui a1, 815104
-; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK64ZFBFMIN-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK64ZFBFMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK64ZFBFMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; CHECK64ZFBFMIN-NEXT:    lui a0, 815104
+; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa3
+; CHECK64ZFBFMIN-NEXT:    neg a0, a1
 ; CHECK64ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fcvt.l.s a1, fa5, rtz
 ; CHECK64ZFBFMIN-NEXT:    and a0, a0, a1
@@ -98,14 +98,14 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind {
 ; RV64ID-LABEL: fcvt_si_bf16_sat:
 ; RV64ID:       # %bb.0: # %start
 ; RV64ID-NEXT:    fmv.x.w a0, fa0
-; RV64ID-NEXT:    slli a0, a0, 16
-; RV64ID-NEXT:    fmv.w.x fa5, a0
-; RV64ID-NEXT:    feq.s a0, fa5, fa5
+; RV64ID-NEXT:    lui a1, 815104
+; RV64ID-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-NEXT:    lui a1, %hi(.LCPI1_0)
+; RV64ID-NEXT:    slli a0, a0, 16
 ; RV64ID-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV64ID-NEXT:    lui a1, 815104
-; RV64ID-NEXT:    fmv.w.x fa3, a1
-; RV64ID-NEXT:    fmax.s fa5, fa5, fa3
+; RV64ID-NEXT:    fmv.w.x fa3, a0
+; RV64ID-NEXT:    feq.s a0, fa3, fa3
+; RV64ID-NEXT:    fmax.s fa5, fa3, fa5
 ; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -163,10 +163,10 @@ define i16 @fcvt_ui_bf16_sat(bfloat %a) nounwind {
 ;
 ; RV32ID-LABEL: fcvt_ui_bf16_sat:
 ; RV32ID:       # %bb.0: # %start
+; RV32ID-NEXT:    lui a0, %hi(.LCPI3_0)
+; RV32ID-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
 ; RV32ID-NEXT:    slli a0, a0, 16
-; RV32ID-NEXT:    lui a1, %hi(.LCPI3_0)
-; RV32ID-NEXT:    flw fa5, %lo(.LCPI3_0)(a1)
 ; RV32ID-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-NEXT:    fmv.w.x fa3, zero
 ; RV32ID-NEXT:    fmax.s fa4, fa4, fa3
@@ -187,10 +187,10 @@ define i16 @fcvt_ui_bf16_sat(bfloat %a) nounwind {
 ;
 ; RV64ID-LABEL: fcvt_ui_bf16_sat:
 ; RV64ID:       # %bb.0: # %start
+; RV64ID-NEXT:    lui a0, %hi(.LCPI3_0)
+; RV64ID-NEXT:    flw fa5, %lo(.LCPI3_0)(a0)
 ; RV64ID-NEXT:    fmv.x.w a0, fa0
 ; RV64ID-NEXT:    slli a0, a0, 16
-; RV64ID-NEXT:    lui a1, %hi(.LCPI3_0)
-; RV64ID-NEXT:    flw fa5, %lo(.LCPI3_0)(a1)
 ; RV64ID-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-NEXT:    fmv.w.x fa3, zero
 ; RV64ID-NEXT:    fmax.s fa4, fa4, fa3
@@ -466,7 +466,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; RV32IZFBFMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFBFMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFBFMIN-NEXT:    call __fixsfdi
-; RV32IZFBFMIN-NEXT:    lui a4, 524288
+; RV32IZFBFMIN-NEXT:    lui a3, 524288
 ; RV32IZFBFMIN-NEXT:    lui a2, 524288
 ; RV32IZFBFMIN-NEXT:    beqz s0, .LBB10_2
 ; RV32IZFBFMIN-NEXT:  # %bb.1: # %start
@@ -474,19 +474,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; RV32IZFBFMIN-NEXT:  .LBB10_2: # %start
 ; RV32IZFBFMIN-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IZFBFMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IZFBFMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFBFMIN-NEXT:    beqz a3, .LBB10_4
+; RV32IZFBFMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFBFMIN-NEXT:    beqz a1, .LBB10_4
 ; RV32IZFBFMIN-NEXT:  # %bb.3:
-; RV32IZFBFMIN-NEXT:    addi a2, a4, -1
+; RV32IZFBFMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFBFMIN-NEXT:  .LBB10_4: # %start
-; RV32IZFBFMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFBFMIN-NEXT:    feq.s a3, fs0, fs0
 ; RV32IZFBFMIN-NEXT:    neg a4, a1
-; RV32IZFBFMIN-NEXT:    and a1, a4, a2
-; RV32IZFBFMIN-NEXT:    neg a2, a3
-; RV32IZFBFMIN-NEXT:    neg a3, s0
+; RV32IZFBFMIN-NEXT:    neg a1, s0
+; RV32IZFBFMIN-NEXT:    neg a3, a3
+; RV32IZFBFMIN-NEXT:    and a0, a1, a0
+; RV32IZFBFMIN-NEXT:    and a1, a3, a2
+; RV32IZFBFMIN-NEXT:    or a0, a4, a0
 ; RV32IZFBFMIN-NEXT:    and a0, a3, a0
-; RV32IZFBFMIN-NEXT:    or a0, a2, a0
-; RV32IZFBFMIN-NEXT:    and a0, a4, a0
 ; RV32IZFBFMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFBFMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFBFMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -505,7 +505,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; R32IDZFBFMIN-NEXT:    fle.s s0, fa5, fs0
 ; R32IDZFBFMIN-NEXT:    fmv.s fa0, fs0
 ; R32IDZFBFMIN-NEXT:    call __fixsfdi
-; R32IDZFBFMIN-NEXT:    lui a4, 524288
+; R32IDZFBFMIN-NEXT:    lui a3, 524288
 ; R32IDZFBFMIN-NEXT:    lui a2, 524288
 ; R32IDZFBFMIN-NEXT:    beqz s0, .LBB10_2
 ; R32IDZFBFMIN-NEXT:  # %bb.1: # %start
@@ -513,19 +513,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; R32IDZFBFMIN-NEXT:  .LBB10_2: # %start
 ; R32IDZFBFMIN-NEXT:    lui a1, %hi(.LCPI10_0)
 ; R32IDZFBFMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; R32IDZFBFMIN-NEXT:    flt.s a3, fa5, fs0
-; R32IDZFBFMIN-NEXT:    beqz a3, .LBB10_4
+; R32IDZFBFMIN-NEXT:    flt.s a1, fa5, fs0
+; R32IDZFBFMIN-NEXT:    beqz a1, .LBB10_4
 ; R32IDZFBFMIN-NEXT:  # %bb.3:
-; R32IDZFBFMIN-NEXT:    addi a2, a4, -1
+; R32IDZFBFMIN-NEXT:    addi a2, a3, -1
 ; R32IDZFBFMIN-NEXT:  .LBB10_4: # %start
-; R32IDZFBFMIN-NEXT:    feq.s a1, fs0, fs0
+; R32IDZFBFMIN-NEXT:    feq.s a3, fs0, fs0
 ; R32IDZFBFMIN-NEXT:    neg a4, a1
-; R32IDZFBFMIN-NEXT:    and a1, a4, a2
-; R32IDZFBFMIN-NEXT:    neg a2, a3
-; R32IDZFBFMIN-NEXT:    neg a3, s0
+; R32IDZFBFMIN-NEXT:    neg a1, s0
+; R32IDZFBFMIN-NEXT:    neg a3, a3
+; R32IDZFBFMIN-NEXT:    and a0, a1, a0
+; R32IDZFBFMIN-NEXT:    and a1, a3, a2
+; R32IDZFBFMIN-NEXT:    or a0, a4, a0
 ; R32IDZFBFMIN-NEXT:    and a0, a3, a0
-; R32IDZFBFMIN-NEXT:    or a0, a2, a0
-; R32IDZFBFMIN-NEXT:    and a0, a4, a0
 ; R32IDZFBFMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; R32IDZFBFMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; R32IDZFBFMIN-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -546,7 +546,7 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-NEXT:    fle.s s0, fa5, fs0
 ; RV32ID-NEXT:    fmv.s fa0, fs0
 ; RV32ID-NEXT:    call __fixsfdi
-; RV32ID-NEXT:    lui a4, 524288
+; RV32ID-NEXT:    lui a3, 524288
 ; RV32ID-NEXT:    lui a2, 524288
 ; RV32ID-NEXT:    beqz s0, .LBB10_2
 ; RV32ID-NEXT:  # %bb.1: # %start
@@ -554,19 +554,19 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-NEXT:  .LBB10_2: # %start
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32ID-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32ID-NEXT:    flt.s a3, fa5, fs0
-; RV32ID-NEXT:    beqz a3, .LBB10_4
+; RV32ID-NEXT:    flt.s a1, fa5, fs0
+; RV32ID-NEXT:    beqz a1, .LBB10_4
 ; RV32ID-NEXT:  # %bb.3:
-; RV32ID-NEXT:    addi a2, a4, -1
+; RV32ID-NEXT:    addi a2, a3, -1
 ; RV32ID-NEXT:  .LBB10_4: # %start
-; RV32ID-NEXT:    feq.s a1, fs0, fs0
+; RV32ID-NEXT:    feq.s a3, fs0, fs0
 ; RV32ID-NEXT:    neg a4, a1
-; RV32ID-NEXT:    and a1, a4, a2
-; RV32ID-NEXT:    neg a2, a3
-; RV32ID-NEXT:    neg a3, s0
+; RV32ID-NEXT:    neg a1, s0
+; RV32ID-NEXT:    neg a3, a3
+; RV32ID-NEXT:    and a0, a1, a0
+; RV32ID-NEXT:    and a1, a3, a2
+; RV32ID-NEXT:    or a0, a4, a0
 ; RV32ID-NEXT:    and a0, a3, a0
-; RV32ID-NEXT:    or a0, a2, a0
-; RV32ID-NEXT:    and a0, a4, a0
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -650,15 +650,15 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK32ZFBFMIN-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa0, fa0
-; CHECK32ZFBFMIN-NEXT:    flt.s a0, fa5, fa0
-; CHECK32ZFBFMIN-NEXT:    neg s0, a0
-; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa5, zero
-; CHECK32ZFBFMIN-NEXT:    fle.s a0, fa5, fa0
+; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK32ZFBFMIN-NEXT:    fle.s a0, fa4, fa0
+; CHECK32ZFBFMIN-NEXT:    flt.s a1, fa5, fa0
+; CHECK32ZFBFMIN-NEXT:    neg s0, a1
 ; CHECK32ZFBFMIN-NEXT:    neg s1, a0
 ; CHECK32ZFBFMIN-NEXT:    call __fixunssfdi
 ; CHECK32ZFBFMIN-NEXT:    and a0, s1, a0
-; CHECK32ZFBFMIN-NEXT:    or a0, s0, a0
 ; CHECK32ZFBFMIN-NEXT:    and a1, s1, a1
+; CHECK32ZFBFMIN-NEXT:    or a0, s0, a0
 ; CHECK32ZFBFMIN-NEXT:    or a1, s0, a1
 ; CHECK32ZFBFMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32ZFBFMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -672,20 +672,20 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32ID-NEXT:    lui a0, %hi(.LCPI12_0)
-; RV32ID-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
+; RV32ID-NEXT:    lui a1, %hi(.LCPI12_0)
+; RV32ID-NEXT:    fmv.w.x fa5, zero
+; RV32ID-NEXT:    flw fa4, %lo(.LCPI12_0)(a1)
 ; RV32ID-NEXT:    slli a0, a0, 16
 ; RV32ID-NEXT:    fmv.w.x fa0, a0
-; RV32ID-NEXT:    flt.s a0, fa5, fa0
-; RV32ID-NEXT:    neg s0, a0
-; RV32ID-NEXT:    fmv.w.x fa5, zero
 ; RV32ID-NEXT:    fle.s a0, fa5, fa0
+; RV32ID-NEXT:    flt.s a1, fa4, fa0
+; RV32ID-NEXT:    neg s0, a1
 ; RV32ID-NEXT:    neg s1, a0
 ; RV32ID-NEXT:    call __fixunssfdi
 ; RV32ID-NEXT:    and a0, s1, a0
-; RV32ID-NEXT:    or a0, s0, a0
 ; RV32ID-NEXT:    and a1, s1, a1
+; RV32ID-NEXT:    or a0, s0, a0
 ; RV32ID-NEXT:    or a1, s0, a1
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1251,7 +1251,7 @@ define double @fcvt_d_bf16(bfloat %a) nounwind {
 ;
 ; R32IDZFBFMIN-LABEL: fcvt_d_bf16:
 ; R32IDZFBFMIN:       # %bb.0:
-; R32IDZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
+; R32IDZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0, dyn
 ; R32IDZFBFMIN-NEXT:    fcvt.d.s fa0, fa5
 ; R32IDZFBFMIN-NEXT:    ret
 ;
@@ -1275,7 +1275,7 @@ define double @fcvt_d_bf16(bfloat %a) nounwind {
 ;
 ; RV64IDZFBFMIN-LABEL: fcvt_d_bf16:
 ; RV64IDZFBFMIN:       # %bb.0:
-; RV64IDZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
+; RV64IDZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0, dyn
 ; RV64IDZFBFMIN-NEXT:    fcvt.d.s fa0, fa5
 ; RV64IDZFBFMIN-NEXT:    ret
 ;
@@ -1498,12 +1498,12 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK32ZFBFMIN:       # %bb.0: # %start
 ; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
+; CHECK32ZFBFMIN-NEXT:    lui a0, 798720
+; CHECK32ZFBFMIN-NEXT:    lui a1, 274400
+; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK32ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
 ; CHECK32ZFBFMIN-NEXT:    neg a0, a0
-; CHECK32ZFBFMIN-NEXT:    lui a1, 798720
-; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
-; CHECK32ZFBFMIN-NEXT:    lui a1, 274400
 ; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK32ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -1513,15 +1513,15 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind {
 ; RV32ID-LABEL: fcvt_w_s_sat_i8:
 ; RV32ID:       # %bb.0: # %start
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
-; RV32ID-NEXT:    slli a0, a0, 16
-; RV32ID-NEXT:    fmv.w.x fa5, a0
-; RV32ID-NEXT:    feq.s a0, fa5, fa5
-; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    lui a1, 798720
-; RV32ID-NEXT:    fmv.w.x fa4, a1
-; RV32ID-NEXT:    fmax.s fa5, fa5, fa4
+; RV32ID-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-NEXT:    lui a1, 274400
+; RV32ID-NEXT:    slli a0, a0, 16
+; RV32ID-NEXT:    fmv.w.x fa4, a0
+; RV32ID-NEXT:    feq.s a0, fa4, fa4
+; RV32ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV32ID-NEXT:    fmv.w.x fa4, a1
+; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-NEXT:    and a0, a0, a1
@@ -1530,12 +1530,12 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind {
 ; CHECK64ZFBFMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK64ZFBFMIN:       # %bb.0: # %start
 ; CHECK64ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
+; CHECK64ZFBFMIN-NEXT:    lui a0, 798720
+; CHECK64ZFBFMIN-NEXT:    lui a1, 274400
+; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK64ZFBFMIN-NEXT:    feq.s a0, fa5, fa5
 ; CHECK64ZFBFMIN-NEXT:    neg a0, a0
-; CHECK64ZFBFMIN-NEXT:    lui a1, 798720
-; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
-; CHECK64ZFBFMIN-NEXT:    lui a1, 274400
 ; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK64ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -1545,15 +1545,15 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind {
 ; RV64ID-LABEL: fcvt_w_s_sat_i8:
 ; RV64ID:       # %bb.0: # %start
 ; RV64ID-NEXT:    fmv.x.w a0, fa0
-; RV64ID-NEXT:    slli a0, a0, 16
-; RV64ID-NEXT:    fmv.w.x fa5, a0
-; RV64ID-NEXT:    feq.s a0, fa5, fa5
-; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    lui a1, 798720
-; RV64ID-NEXT:    fmv.w.x fa4, a1
-; RV64ID-NEXT:    fmax.s fa5, fa5, fa4
+; RV64ID-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-NEXT:    lui a1, 274400
+; RV64ID-NEXT:    slli a0, a0, 16
+; RV64ID-NEXT:    fmv.w.x fa4, a0
+; RV64ID-NEXT:    feq.s a0, fa4, fa4
+; RV64ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV64ID-NEXT:    fmv.w.x fa4, a1
+; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-NEXT:    and a0, a0, a1
@@ -1601,8 +1601,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind {
 ; CHECK32ZFBFMIN:       # %bb.0: # %start
 ; CHECK32ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    lui a0, 276464
+; CHECK32ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK32ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32ZFBFMIN-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -1611,11 +1611,11 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind {
 ; RV32ID-LABEL: fcvt_wu_s_sat_i8:
 ; RV32ID:       # %bb.0: # %start
 ; RV32ID-NEXT:    fmv.x.w a0, fa0
+; RV32ID-NEXT:    fmv.w.x fa5, zero
 ; RV32ID-NEXT:    slli a0, a0, 16
-; RV32ID-NEXT:    fmv.w.x fa5, a0
-; RV32ID-NEXT:    fmv.w.x fa4, zero
-; RV32ID-NEXT:    fmax.s fa5, fa5, fa4
+; RV32ID-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-NEXT:    lui a0, 276464
+; RV32ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV32ID-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -1625,8 +1625,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind {
 ; CHECK64ZFBFMIN:       # %bb.0: # %start
 ; CHECK64ZFBFMIN-NEXT:    fcvt.s.bf16 fa5, fa0
 ; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    lui a0, 276464
+; CHECK64ZFBFMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK64ZFBFMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64ZFBFMIN-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -1635,11 +1635,11 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind {
 ; RV64ID-LABEL: fcvt_wu_s_sat_i8:
 ; RV64ID:       # %bb.0: # %start
 ; RV64ID-NEXT:    fmv.x.w a0, fa0
+; RV64ID-NEXT:    fmv.w.x fa5, zero
 ; RV64ID-NEXT:    slli a0, a0, 16
-; RV64ID-NEXT:    fmv.w.x fa5, a0
-; RV64ID-NEXT:    fmv.w.x fa4, zero
-; RV64ID-NEXT:    fmax.s fa5, fa5, fa4
+; RV64ID-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-NEXT:    lui a0, 276464
+; RV64ID-NEXT:    fmax.s fa5, fa4, fa5
 ; RV64ID-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.lu.s a0, fa5, rtz

diff  --git a/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll b/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll
index 9d5ba73de191d5..cc572ce489f62c 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-fcmp.ll
@@ -92,9 +92,9 @@ define i32 @fcmp_ord(bfloat %a, bfloat %b) nounwind {
 ; CHECK-LABEL: fcmp_ord:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
 ; CHECK-NEXT:    feq.s a0, fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT:    feq.s a1, fa5, fa5
+; CHECK-NEXT:    feq.s a1, fa4, fa4
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    ret
   %1 = fcmp ord bfloat %a, %b
@@ -186,9 +186,9 @@ define i32 @fcmp_uno(bfloat %a, bfloat %b) nounwind {
 ; CHECK-LABEL: fcmp_uno:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
+; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
 ; CHECK-NEXT:    feq.s a0, fa5, fa5
-; CHECK-NEXT:    fcvt.s.bf16 fa5, fa0
-; CHECK-NEXT:    feq.s a1, fa5, fa5
+; CHECK-NEXT:    feq.s a1, fa4, fa4
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    xori a0, a0, 1
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/bfloat-mem.ll b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
index a9ef261bb93024..f9cf4e523b77d4 100644
--- a/llvm/test/CodeGen/RISCV/bfloat-mem.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat-mem.ll
@@ -50,10 +50,10 @@ define bfloat @flh_fsh_global(bfloat %a, bfloat %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fcvt.s.bf16 fa5, fa1
 ; CHECK-NEXT:    fcvt.s.bf16 fa4, fa0
+; CHECK-NEXT:    lui a0, %hi(G)
 ; CHECK-NEXT:    fadd.s fa5, fa4, fa5
+; CHECK-NEXT:    flh fa4, %lo(G)(a0)
 ; CHECK-NEXT:    fcvt.bf16.s fa0, fa5
-; CHECK-NEXT:    lui a0, %hi(G)
-; CHECK-NEXT:    flh fa5, %lo(G)(a0)
 ; CHECK-NEXT:    addi a1, a0, %lo(G)
 ; CHECK-NEXT:    fsh fa0, %lo(G)(a0)
 ; CHECK-NEXT:    flh fa5, 18(a1)

diff  --git a/llvm/test/CodeGen/RISCV/bfloat.ll b/llvm/test/CodeGen/RISCV/bfloat.ll
index 9dc8ce6be1ea61..c83b0ed6b0eee3 100644
--- a/llvm/test/CodeGen/RISCV/bfloat.ll
+++ b/llvm/test/CodeGen/RISCV/bfloat.ll
@@ -342,8 +342,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
 ; RV32ID-ILP32-NEXT:    addi sp, sp, -16
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32ID-ILP32-NEXT:    fmv.x.w a0, fa5
@@ -359,8 +359,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
 ; RV64ID-LP64-NEXT:    addi sp, sp, -16
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    slli a1, a1, 16
-; RV64ID-LP64-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64-NEXT:    slli a0, a0, 16
+; RV64ID-LP64-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64ID-LP64-NEXT:    fmv.x.w a0, fa5
@@ -378,8 +378,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
 ; RV32ID-ILP32D-NEXT:    fmv.x.w a0, fa0
 ; RV32ID-ILP32D-NEXT:    fmv.x.w a1, fa1
 ; RV32ID-ILP32D-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32ID-ILP32D-NEXT:    call __truncsfbf2
@@ -398,8 +398,8 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
 ; RV64ID-LP64D-NEXT:    fmv.x.w a0, fa0
 ; RV64ID-LP64D-NEXT:    fmv.x.w a1, fa1
 ; RV64ID-LP64D-NEXT:    slli a1, a1, 16
-; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    slli a0, a0, 16
+; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64ID-LP64D-NEXT:    call __truncsfbf2
@@ -450,8 +450,8 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV32ID-ILP32-NEXT:    lhu a1, 6(a0)
 ; RV32ID-ILP32-NEXT:    lhu a0, 0(a0)
 ; RV32ID-ILP32-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32ID-ILP32-NEXT:    fmv.x.w a0, fa5
@@ -469,8 +469,8 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV64ID-LP64-NEXT:    lhu a1, 6(a0)
 ; RV64ID-LP64-NEXT:    lhu a0, 0(a0)
 ; RV64ID-LP64-NEXT:    slli a1, a1, 16
-; RV64ID-LP64-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64-NEXT:    slli a0, a0, 16
+; RV64ID-LP64-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64ID-LP64-NEXT:    fmv.x.w a0, fa5
@@ -488,8 +488,8 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV32ID-ILP32D-NEXT:    lhu a1, 6(a0)
 ; RV32ID-ILP32D-NEXT:    lhu a0, 0(a0)
 ; RV32ID-ILP32D-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32ID-ILP32D-NEXT:    call __truncsfbf2
@@ -508,8 +508,8 @@ define bfloat @bfloat_load(ptr %a) nounwind {
 ; RV64ID-LP64D-NEXT:    lhu a1, 6(a0)
 ; RV64ID-LP64D-NEXT:    lhu a0, 0(a0)
 ; RV64ID-LP64D-NEXT:    slli a1, a1, 16
-; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    slli a0, a0, 16
+; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64ID-LP64D-NEXT:    call __truncsfbf2
@@ -569,8 +569,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
 ; RV32ID-ILP32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    mv s0, a0
 ; RV32ID-ILP32-NEXT:    slli a2, a2, 16
-; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a2
 ; RV32ID-ILP32-NEXT:    slli a1, a1, 16
+; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-ILP32-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32ID-ILP32-NEXT:    fmv.x.w a0, fa5
@@ -589,8 +589,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
 ; RV64ID-LP64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    mv s0, a0
 ; RV64ID-LP64-NEXT:    slli a2, a2, 16
-; RV64ID-LP64-NEXT:    fmv.w.x fa5, a2
 ; RV64ID-LP64-NEXT:    slli a1, a1, 16
+; RV64ID-LP64-NEXT:    fmv.w.x fa5, a2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-LP64-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64ID-LP64-NEXT:    fmv.x.w a0, fa5
@@ -611,8 +611,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
 ; RV32ID-ILP32D-NEXT:    fmv.x.w a0, fa0
 ; RV32ID-ILP32D-NEXT:    fmv.x.w a1, fa1
 ; RV32ID-ILP32D-NEXT:    slli a1, a1, 16
-; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    slli a0, a0, 16
+; RV32ID-ILP32D-NEXT:    fmv.w.x fa5, a1
 ; RV32ID-ILP32D-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV32ID-ILP32D-NEXT:    call __truncsfbf2
@@ -633,8 +633,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
 ; RV64ID-LP64D-NEXT:    fmv.x.w a0, fa0
 ; RV64ID-LP64D-NEXT:    fmv.x.w a1, fa1
 ; RV64ID-LP64D-NEXT:    slli a1, a1, 16
-; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    slli a0, a0, 16
+; RV64ID-LP64D-NEXT:    fmv.w.x fa5, a1
 ; RV64ID-LP64D-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64D-NEXT:    fadd.s fa0, fa4, fa5
 ; RV64ID-LP64D-NEXT:    call __truncsfbf2

diff  --git a/llvm/test/CodeGen/RISCV/bitextract-mac.ll b/llvm/test/CodeGen/RISCV/bitextract-mac.ll
index ce1e0c4711ffbe..41a32656e32578 100644
--- a/llvm/test/CodeGen/RISCV/bitextract-mac.ll
+++ b/llvm/test/CodeGen/RISCV/bitextract-mac.ll
@@ -25,8 +25,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    mul a0, a1, a0
 ; RV32I-NEXT:    slli a1, a0, 26
-; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a1, a1, 28
 ; RV32I-NEXT:    srli a0, a0, 25
 ; RV32I-NEXT:    mul a0, a1, a0
 ; RV32I-NEXT:    add a0, a0, a2
@@ -36,8 +36,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV32ZBB:       # %bb.0: # %entry
 ; RV32ZBB-NEXT:    mul a0, a1, a0
 ; RV32ZBB-NEXT:    slli a1, a0, 26
-; RV32ZBB-NEXT:    srli a1, a1, 28
 ; RV32ZBB-NEXT:    slli a0, a0, 20
+; RV32ZBB-NEXT:    srli a1, a1, 28
 ; RV32ZBB-NEXT:    srli a0, a0, 25
 ; RV32ZBB-NEXT:    mul a0, a1, a0
 ; RV32ZBB-NEXT:    add a0, a0, a2
@@ -56,8 +56,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV32XTHEADMAC:       # %bb.0: # %entry
 ; RV32XTHEADMAC-NEXT:    mul a0, a1, a0
 ; RV32XTHEADMAC-NEXT:    slli a1, a0, 26
-; RV32XTHEADMAC-NEXT:    srli a1, a1, 28
 ; RV32XTHEADMAC-NEXT:    slli a0, a0, 20
+; RV32XTHEADMAC-NEXT:    srli a1, a1, 28
 ; RV32XTHEADMAC-NEXT:    srli a0, a0, 25
 ; RV32XTHEADMAC-NEXT:    th.mulah a2, a1, a0
 ; RV32XTHEADMAC-NEXT:    mv a0, a2
@@ -76,8 +76,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    mul a0, a1, a0
 ; RV64I-NEXT:    slli a1, a0, 58
-; RV64I-NEXT:    srli a1, a1, 60
 ; RV64I-NEXT:    slli a0, a0, 52
+; RV64I-NEXT:    srli a1, a1, 60
 ; RV64I-NEXT:    srli a0, a0, 57
 ; RV64I-NEXT:    mul a0, a1, a0
 ; RV64I-NEXT:    addw a0, a0, a2
@@ -87,8 +87,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV64ZBB:       # %bb.0: # %entry
 ; RV64ZBB-NEXT:    mul a0, a1, a0
 ; RV64ZBB-NEXT:    slli a1, a0, 58
-; RV64ZBB-NEXT:    srli a1, a1, 60
 ; RV64ZBB-NEXT:    slli a0, a0, 52
+; RV64ZBB-NEXT:    srli a1, a1, 60
 ; RV64ZBB-NEXT:    srli a0, a0, 57
 ; RV64ZBB-NEXT:    mul a0, a1, a0
 ; RV64ZBB-NEXT:    addw a0, a0, a2
@@ -98,8 +98,8 @@ define i32 @f(i32 %A, i32 %B, i32 %C) {
 ; RV64XTHEADMAC:       # %bb.0: # %entry
 ; RV64XTHEADMAC-NEXT:    mul a0, a1, a0
 ; RV64XTHEADMAC-NEXT:    slli a1, a0, 58
-; RV64XTHEADMAC-NEXT:    srli a1, a1, 60
 ; RV64XTHEADMAC-NEXT:    slli a0, a0, 52
+; RV64XTHEADMAC-NEXT:    srli a1, a1, 60
 ; RV64XTHEADMAC-NEXT:    srli a0, a0, 57
 ; RV64XTHEADMAC-NEXT:    th.mulah a2, a1, a0
 ; RV64XTHEADMAC-NEXT:    mv a0, a2

diff  --git a/llvm/test/CodeGen/RISCV/bittest.ll b/llvm/test/CodeGen/RISCV/bittest.ll
index d280e5ee46b7c0..d69ab0550a0344 100644
--- a/llvm/test/CodeGen/RISCV/bittest.ll
+++ b/llvm/test/CodeGen/RISCV/bittest.ll
@@ -751,10 +751,10 @@ define signext i32 @bit_31_nz_select_i32(i32 signext %a, i32 signext %b, i32 sig
 define i64 @bit_10_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_10_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a6, a0, 1024
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    andi a3, a0, 1024
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a6, .LBB23_2
+; RV32-NEXT:    beqz a3, .LBB23_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -779,11 +779,11 @@ define i64 @bit_10_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I-LABEL: bit_10_nz_select_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a0, a0, 21
-; RV32I-NEXT:    srli a6, a0, 31
 ; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    slli a0, a0, 21
+; RV32I-NEXT:    srli a3, a0, 31
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a6, .LBB24_2
+; RV32I-NEXT:    bnez a3, .LBB24_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
 ; RV32I-NEXT:    mv a1, a5
@@ -802,10 +802,10 @@ define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV32ZBS-LABEL: bit_10_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
-; RV32ZBS-NEXT:    bexti a6, a0, 10
 ; RV32ZBS-NEXT:    mv a1, a3
+; RV32ZBS-NEXT:    bexti a3, a0, 10
 ; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    bnez a6, .LBB24_2
+; RV32ZBS-NEXT:    bnez a3, .LBB24_2
 ; RV32ZBS-NEXT:  # %bb.1:
 ; RV32ZBS-NEXT:    mv a0, a4
 ; RV32ZBS-NEXT:    mv a1, a5
@@ -814,10 +814,10 @@ define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV32XTHEADBS-LABEL: bit_10_nz_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
-; RV32XTHEADBS-NEXT:    th.tst a6, a0, 10
 ; RV32XTHEADBS-NEXT:    mv a1, a3
+; RV32XTHEADBS-NEXT:    th.tst a3, a0, 10
 ; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    bnez a6, .LBB24_2
+; RV32XTHEADBS-NEXT:    bnez a3, .LBB24_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
 ; RV32XTHEADBS-NEXT:    mv a0, a4
 ; RV32XTHEADBS-NEXT:    mv a1, a5
@@ -832,10 +832,10 @@ define i64 @bit_10_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_11_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_11_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a6, a0, 20
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    slli a3, a0, 20
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bgez a6, .LBB25_2
+; RV32-NEXT:    bgez a3, .LBB25_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -860,11 +860,11 @@ define i64 @bit_11_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I-LABEL: bit_11_nz_select_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a0, a0, 20
-; RV32I-NEXT:    srli a6, a0, 31
 ; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a3, a0, 31
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a6, .LBB26_2
+; RV32I-NEXT:    bnez a3, .LBB26_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
 ; RV32I-NEXT:    mv a1, a5
@@ -883,10 +883,10 @@ define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV32ZBS-LABEL: bit_11_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
-; RV32ZBS-NEXT:    bexti a6, a0, 11
 ; RV32ZBS-NEXT:    mv a1, a3
+; RV32ZBS-NEXT:    bexti a3, a0, 11
 ; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    bnez a6, .LBB26_2
+; RV32ZBS-NEXT:    bnez a3, .LBB26_2
 ; RV32ZBS-NEXT:  # %bb.1:
 ; RV32ZBS-NEXT:    mv a0, a4
 ; RV32ZBS-NEXT:    mv a1, a5
@@ -895,10 +895,10 @@ define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV32XTHEADBS-LABEL: bit_11_nz_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
-; RV32XTHEADBS-NEXT:    th.tst a6, a0, 11
 ; RV32XTHEADBS-NEXT:    mv a1, a3
+; RV32XTHEADBS-NEXT:    th.tst a3, a0, 11
 ; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    bnez a6, .LBB26_2
+; RV32XTHEADBS-NEXT:    bnez a3, .LBB26_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
 ; RV32XTHEADBS-NEXT:    mv a0, a4
 ; RV32XTHEADBS-NEXT:    mv a1, a5
@@ -913,10 +913,10 @@ define i64 @bit_11_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_20_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_20_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a6, a0, 11
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    slli a3, a0, 11
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bgez a6, .LBB27_2
+; RV32-NEXT:    bgez a3, .LBB27_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -941,11 +941,11 @@ define i64 @bit_20_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_20_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I-LABEL: bit_20_nz_select_i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a0, a0, 11
-; RV32I-NEXT:    srli a6, a0, 31
 ; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    slli a0, a0, 11
+; RV32I-NEXT:    srli a3, a0, 31
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a6, .LBB28_2
+; RV32I-NEXT:    bnez a3, .LBB28_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
 ; RV32I-NEXT:    mv a1, a5
@@ -964,10 +964,10 @@ define i64 @bit_20_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV32ZBS-LABEL: bit_20_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
-; RV32ZBS-NEXT:    bexti a6, a0, 20
 ; RV32ZBS-NEXT:    mv a1, a3
+; RV32ZBS-NEXT:    bexti a3, a0, 20
 ; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    bnez a6, .LBB28_2
+; RV32ZBS-NEXT:    bnez a3, .LBB28_2
 ; RV32ZBS-NEXT:  # %bb.1:
 ; RV32ZBS-NEXT:    mv a0, a4
 ; RV32ZBS-NEXT:    mv a1, a5
@@ -976,10 +976,10 @@ define i64 @bit_20_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV32XTHEADBS-LABEL: bit_20_nz_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
-; RV32XTHEADBS-NEXT:    th.tst a6, a0, 20
 ; RV32XTHEADBS-NEXT:    mv a1, a3
+; RV32XTHEADBS-NEXT:    th.tst a3, a0, 20
 ; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    bnez a6, .LBB28_2
+; RV32XTHEADBS-NEXT:    bnez a3, .LBB28_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
 ; RV32XTHEADBS-NEXT:    mv a0, a4
 ; RV32XTHEADBS-NEXT:    mv a1, a5
@@ -1021,10 +1021,10 @@ define i64 @bit_31_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_31_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_31_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    srli a6, a0, 31
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    srli a3, a0, 31
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a6, .LBB30_2
+; RV32-NEXT:    bnez a3, .LBB30_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -1049,14 +1049,14 @@ define i64 @bit_31_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_32_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a6, a1, 1
-; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    andi a1, a1, 1
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a6, .LBB31_2
+; RV32-NEXT:    beqz a1, .LBB31_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB31_2:
+; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_32_z_select_i64:
@@ -1077,14 +1077,14 @@ define i64 @bit_32_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_32_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a6, a1, 1
-; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    andi a1, a1, 1
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a6, .LBB32_2
+; RV32-NEXT:    bnez a1, .LBB32_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB32_2:
+; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_32_nz_select_i64:
@@ -1105,14 +1105,14 @@ define i64 @bit_32_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_55_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_55_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a6, a1, 8
-; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    slli a1, a1, 8
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bgez a6, .LBB33_2
+; RV32-NEXT:    bgez a1, .LBB33_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB33_2:
+; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_55_z_select_i64:
@@ -1134,14 +1134,14 @@ define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I-LABEL: bit_55_nz_select_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    srli a6, a1, 31
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    srli a1, a1, 31
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a6, .LBB34_2
+; RV32I-NEXT:    bnez a1, .LBB34_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB34_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64-LABEL: bit_55_nz_select_i64:
@@ -1156,26 +1156,26 @@ define i64 @bit_55_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ;
 ; RV32ZBS-LABEL: bit_55_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
-; RV32ZBS-NEXT:    bexti a6, a1, 23
-; RV32ZBS-NEXT:    mv a1, a3
+; RV32ZBS-NEXT:    bexti a1, a1, 23
 ; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    bnez a6, .LBB34_2
+; RV32ZBS-NEXT:    bnez a1, .LBB34_2
 ; RV32ZBS-NEXT:  # %bb.1:
 ; RV32ZBS-NEXT:    mv a0, a4
-; RV32ZBS-NEXT:    mv a1, a5
+; RV32ZBS-NEXT:    mv a3, a5
 ; RV32ZBS-NEXT:  .LBB34_2:
+; RV32ZBS-NEXT:    mv a1, a3
 ; RV32ZBS-NEXT:    ret
 ;
 ; RV32XTHEADBS-LABEL: bit_55_nz_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
-; RV32XTHEADBS-NEXT:    th.tst a6, a1, 23
-; RV32XTHEADBS-NEXT:    mv a1, a3
+; RV32XTHEADBS-NEXT:    th.tst a1, a1, 23
 ; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    bnez a6, .LBB34_2
+; RV32XTHEADBS-NEXT:    bnez a1, .LBB34_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
 ; RV32XTHEADBS-NEXT:    mv a0, a4
-; RV32XTHEADBS-NEXT:    mv a1, a5
+; RV32XTHEADBS-NEXT:    mv a3, a5
 ; RV32XTHEADBS-NEXT:  .LBB34_2:
+; RV32XTHEADBS-NEXT:    mv a1, a3
 ; RV32XTHEADBS-NEXT:    ret
   %1 = and i64 %a, 36028797018963968
   %2 = icmp ne i64 %1, 0
@@ -1212,14 +1212,14 @@ define i64 @bit_63_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_63_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_63_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    srli a6, a1, 31
-; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a6, .LBB36_2
+; RV32-NEXT:    bnez a1, .LBB36_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB36_2:
+; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_63_nz_select_i64:
@@ -2108,10 +2108,10 @@ define signext i32 @bit_32_1_nz_select_i32(i32 signext %a, i32 signext %b, i32 s
 define i64 @bit_10_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_10_1_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a6, a0, 1023
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    andi a3, a0, 1023
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a6, .LBB71_2
+; RV32-NEXT:    beqz a3, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -2136,10 +2136,10 @@ define i64 @bit_10_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_10_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_10_1_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a6, a0, 1023
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    andi a3, a0, 1023
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a6, .LBB72_2
+; RV32-NEXT:    bnez a3, .LBB72_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -2164,10 +2164,10 @@ define i64 @bit_10_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_11_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_11_1_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a6, a0, 2047
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    andi a3, a0, 2047
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a6, .LBB73_2
+; RV32-NEXT:    beqz a3, .LBB73_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -2192,10 +2192,10 @@ define i64 @bit_11_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_11_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_11_1_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a6, a0, 2047
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    andi a3, a0, 2047
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a6, .LBB74_2
+; RV32-NEXT:    bnez a3, .LBB74_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -2220,10 +2220,10 @@ define i64 @bit_11_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_16_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_16_1_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a6, a0, 16
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    slli a3, a0, 16
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a6, .LBB75_2
+; RV32-NEXT:    beqz a3, .LBB75_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -2276,10 +2276,10 @@ define i64 @bit_16_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_20_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_20_1_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a6, a0, 12
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    slli a3, a0, 12
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a6, .LBB77_2
+; RV32-NEXT:    beqz a3, .LBB77_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -2304,10 +2304,10 @@ define i64 @bit_20_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_20_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_20_1_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a6, a0, 12
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    slli a3, a0, 12
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a6, .LBB78_2
+; RV32-NEXT:    bnez a3, .LBB78_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -2332,10 +2332,10 @@ define i64 @bit_20_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_31_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_31_1_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a6, a0, 1
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    slli a3, a0, 1
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a6, .LBB79_2
+; RV32-NEXT:    beqz a3, .LBB79_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -2360,10 +2360,10 @@ define i64 @bit_31_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_31_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_31_1_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slli a6, a0, 1
 ; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    slli a3, a0, 1
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a6, .LBB80_2
+; RV32-NEXT:    bnez a3, .LBB80_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
 ; RV32-NEXT:    mv a1, a5
@@ -2444,14 +2444,14 @@ define i64 @bit_55_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a1, 9
 ; RV32-NEXT:    srli a1, a1, 9
-; RV32-NEXT:    or a6, a0, a1
-; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    or a1, a0, a1
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a6, .LBB83_2
+; RV32-NEXT:    beqz a1, .LBB83_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB83_2:
+; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_55_1_z_select_i64:
@@ -2474,14 +2474,14 @@ define i64 @bit_55_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a1, a1, 9
 ; RV32-NEXT:    srli a1, a1, 9
-; RV32-NEXT:    or a6, a0, a1
-; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    or a1, a0, a1
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a6, .LBB84_2
+; RV32-NEXT:    bnez a1, .LBB84_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB84_2:
+; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_55_1_nz_select_i64:
@@ -2504,14 +2504,14 @@ define i64 @bit_63_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    or a6, a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a6, .LBB85_2
+; RV32I-NEXT:    beqz a1, .LBB85_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB85_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64-LABEL: bit_63_1_z_select_i64:
@@ -2527,28 +2527,28 @@ define i64 @bit_63_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32ZBS-LABEL: bit_63_1_z_select_i64:
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    bclri a1, a1, 31
-; RV32ZBS-NEXT:    or a6, a0, a1
-; RV32ZBS-NEXT:    mv a1, a3
+; RV32ZBS-NEXT:    or a1, a0, a1
 ; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    beqz a6, .LBB85_2
+; RV32ZBS-NEXT:    beqz a1, .LBB85_2
 ; RV32ZBS-NEXT:  # %bb.1:
 ; RV32ZBS-NEXT:    mv a0, a4
-; RV32ZBS-NEXT:    mv a1, a5
+; RV32ZBS-NEXT:    mv a3, a5
 ; RV32ZBS-NEXT:  .LBB85_2:
+; RV32ZBS-NEXT:    mv a1, a3
 ; RV32ZBS-NEXT:    ret
 ;
 ; RV32XTHEADBS-LABEL: bit_63_1_z_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
 ; RV32XTHEADBS-NEXT:    slli a1, a1, 1
 ; RV32XTHEADBS-NEXT:    srli a1, a1, 1
-; RV32XTHEADBS-NEXT:    or a6, a0, a1
-; RV32XTHEADBS-NEXT:    mv a1, a3
+; RV32XTHEADBS-NEXT:    or a1, a0, a1
 ; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    beqz a6, .LBB85_2
+; RV32XTHEADBS-NEXT:    beqz a1, .LBB85_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
 ; RV32XTHEADBS-NEXT:    mv a0, a4
-; RV32XTHEADBS-NEXT:    mv a1, a5
+; RV32XTHEADBS-NEXT:    mv a3, a5
 ; RV32XTHEADBS-NEXT:  .LBB85_2:
+; RV32XTHEADBS-NEXT:    mv a1, a3
 ; RV32XTHEADBS-NEXT:    ret
   %1 = and i64 %a, 9223372036854775807
   %2 = icmp eq i64 %1, 0
@@ -2561,14 +2561,14 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    or a6, a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a6, .LBB86_2
+; RV32I-NEXT:    bnez a1, .LBB86_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB86_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64-LABEL: bit_63_1_nz_select_i64:
@@ -2584,28 +2584,28 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32ZBS-LABEL: bit_63_1_nz_select_i64:
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    bclri a1, a1, 31
-; RV32ZBS-NEXT:    or a6, a0, a1
-; RV32ZBS-NEXT:    mv a1, a3
+; RV32ZBS-NEXT:    or a1, a0, a1
 ; RV32ZBS-NEXT:    mv a0, a2
-; RV32ZBS-NEXT:    bnez a6, .LBB86_2
+; RV32ZBS-NEXT:    bnez a1, .LBB86_2
 ; RV32ZBS-NEXT:  # %bb.1:
 ; RV32ZBS-NEXT:    mv a0, a4
-; RV32ZBS-NEXT:    mv a1, a5
+; RV32ZBS-NEXT:    mv a3, a5
 ; RV32ZBS-NEXT:  .LBB86_2:
+; RV32ZBS-NEXT:    mv a1, a3
 ; RV32ZBS-NEXT:    ret
 ;
 ; RV32XTHEADBS-LABEL: bit_63_1_nz_select_i64:
 ; RV32XTHEADBS:       # %bb.0:
 ; RV32XTHEADBS-NEXT:    slli a1, a1, 1
 ; RV32XTHEADBS-NEXT:    srli a1, a1, 1
-; RV32XTHEADBS-NEXT:    or a6, a0, a1
-; RV32XTHEADBS-NEXT:    mv a1, a3
+; RV32XTHEADBS-NEXT:    or a1, a0, a1
 ; RV32XTHEADBS-NEXT:    mv a0, a2
-; RV32XTHEADBS-NEXT:    bnez a6, .LBB86_2
+; RV32XTHEADBS-NEXT:    bnez a1, .LBB86_2
 ; RV32XTHEADBS-NEXT:  # %bb.1:
 ; RV32XTHEADBS-NEXT:    mv a0, a4
-; RV32XTHEADBS-NEXT:    mv a1, a5
+; RV32XTHEADBS-NEXT:    mv a3, a5
 ; RV32XTHEADBS-NEXT:  .LBB86_2:
+; RV32XTHEADBS-NEXT:    mv a1, a3
 ; RV32XTHEADBS-NEXT:    ret
   %1 = and i64 %a, 9223372036854775807
   %2 = icmp ne i64 %1, 0
@@ -2616,14 +2616,14 @@ define i64 @bit_63_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_64_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_64_1_z_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    or a6, a0, a1
-; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    or a1, a0, a1
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    beqz a6, .LBB87_2
+; RV32-NEXT:    beqz a1, .LBB87_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB87_2:
+; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_64_1_z_select_i64:
@@ -2643,14 +2643,14 @@ define i64 @bit_64_1_z_select_i64(i64 %a, i64 %b, i64 %c) {
 define i64 @bit_64_1_nz_select_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32-LABEL: bit_64_1_nz_select_i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    or a6, a0, a1
-; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    or a1, a0, a1
 ; RV32-NEXT:    mv a0, a2
-; RV32-NEXT:    bnez a6, .LBB88_2
+; RV32-NEXT:    bnez a1, .LBB88_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a4
-; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    mv a3, a5
 ; RV32-NEXT:  .LBB88_2:
+; RV32-NEXT:    mv a1, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: bit_64_1_nz_select_i64:

diff  --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
index 9c69fe0a6e4865..40a57721423453 100644
--- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll
@@ -57,11 +57,11 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
@@ -72,11 +72,11 @@ define i32 @test_bswap_i32(i32 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srliw a3, a0, 24
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
@@ -102,53 +102,52 @@ define i64 @test_bswap_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 8
 ; RV32I-NEXT:    lui a3, 16
+; RV32I-NEXT:    srli a4, a1, 24
+; RV32I-NEXT:    srli a5, a0, 8
 ; RV32I-NEXT:    addi a3, a3, -256
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    or a2, a1, a2
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a5, a1, 24
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a0, a3
+; RV32I-NEXT:    or a0, a1, a2
+; RV32I-NEXT:    or a1, a3, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_bswap_i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 40
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srli a3, a0, 56
+; RV64I-NEXT:    srli a4, a0, 24
+; RV64I-NEXT:    lui a5, 4080
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    lui a4, 4080
-; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a5, a0, 8
-; RV64I-NEXT:    srliw a5, a5, 24
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    and a4, a0, a4
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    and a4, a4, a5
+; RV64I-NEXT:    srliw a3, a3, 24
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    and a5, a0, a5
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a2, a2, 40
+; RV64I-NEXT:    or a1, a3, a1
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
@@ -171,18 +170,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; RV32I-LABEL: test_bitreverse_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a1, a0, 15
-; RV32I-NEXT:    slli a1, a1, 4
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    slli a1, a1, 4
 ; RV32I-NEXT:    srli a0, a0, 28
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    andi a1, a0, 51
-; RV32I-NEXT:    slli a1, a1, 2
 ; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    slli a1, a1, 2
 ; RV32I-NEXT:    andi a0, a0, 51
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    andi a1, a0, 85
-; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    andi a0, a0, 85
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -190,18 +189,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; RV64I-LABEL: test_bitreverse_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    andi a1, a0, 15
-; RV64I-NEXT:    slli a1, a1, 4
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a1, a1, 4
 ; RV64I-NEXT:    srli a0, a0, 60
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    andi a1, a0, 51
-; RV64I-NEXT:    slli a1, a1, 2
 ; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    slli a1, a1, 2
 ; RV64I-NEXT:    andi a0, a0, 51
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    andi a1, a0, 85
-; RV64I-NEXT:    slli a1, a1, 1
 ; RV64I-NEXT:    srli a0, a0, 1
+; RV64I-NEXT:    slli a1, a1, 1
 ; RV64I-NEXT:    andi a0, a0, 85
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -209,18 +208,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; RV32ZBB-LABEL: test_bitreverse_i8:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    andi a1, a0, 15
-; RV32ZBB-NEXT:    slli a1, a1, 4
 ; RV32ZBB-NEXT:    slli a0, a0, 24
+; RV32ZBB-NEXT:    slli a1, a1, 4
 ; RV32ZBB-NEXT:    srli a0, a0, 28
 ; RV32ZBB-NEXT:    or a0, a0, a1
 ; RV32ZBB-NEXT:    andi a1, a0, 51
-; RV32ZBB-NEXT:    slli a1, a1, 2
 ; RV32ZBB-NEXT:    srli a0, a0, 2
+; RV32ZBB-NEXT:    slli a1, a1, 2
 ; RV32ZBB-NEXT:    andi a0, a0, 51
 ; RV32ZBB-NEXT:    or a0, a0, a1
 ; RV32ZBB-NEXT:    andi a1, a0, 85
-; RV32ZBB-NEXT:    slli a1, a1, 1
 ; RV32ZBB-NEXT:    srli a0, a0, 1
+; RV32ZBB-NEXT:    slli a1, a1, 1
 ; RV32ZBB-NEXT:    andi a0, a0, 85
 ; RV32ZBB-NEXT:    or a0, a0, a1
 ; RV32ZBB-NEXT:    ret
@@ -228,18 +227,18 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
 ; RV64ZBB-LABEL: test_bitreverse_i8:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    andi a1, a0, 15
-; RV64ZBB-NEXT:    slli a1, a1, 4
 ; RV64ZBB-NEXT:    slli a0, a0, 56
+; RV64ZBB-NEXT:    slli a1, a1, 4
 ; RV64ZBB-NEXT:    srli a0, a0, 60
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    andi a1, a0, 51
-; RV64ZBB-NEXT:    slli a1, a1, 2
 ; RV64ZBB-NEXT:    srli a0, a0, 2
+; RV64ZBB-NEXT:    slli a1, a1, 2
 ; RV64ZBB-NEXT:    andi a0, a0, 51
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    andi a1, a0, 85
-; RV64ZBB-NEXT:    slli a1, a1, 1
 ; RV64ZBB-NEXT:    srli a0, a0, 1
+; RV64ZBB-NEXT:    slli a1, a1, 1
 ; RV64ZBB-NEXT:    andi a0, a0, 85
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    ret
@@ -266,27 +265,27 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    slli a0, a0, 16
+; RV32I-NEXT:    lui a2, 1
 ; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    lui a2, 1
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 3
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 5
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -295,27 +294,27 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    slli a0, a0, 48
+; RV64I-NEXT:    lui a2, 1
 ; RV64I-NEXT:    srli a0, a0, 56
+; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 1
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 3
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 3
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 5
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -323,25 +322,25 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; RV32ZBB-LABEL: test_bitreverse_i16:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    rev8 a0, a0
-; RV32ZBB-NEXT:    srli a1, a0, 12
-; RV32ZBB-NEXT:    lui a2, 15
-; RV32ZBB-NEXT:    addi a2, a2, 240
-; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a1, 15
+; RV32ZBB-NEXT:    srli a2, a0, 12
+; RV32ZBB-NEXT:    addi a1, a1, 240
+; RV32ZBB-NEXT:    and a1, a2, a1
+; RV32ZBB-NEXT:    lui a2, 3
 ; RV32ZBB-NEXT:    srli a0, a0, 20
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    andi a0, a0, -241
 ; RV32ZBB-NEXT:    or a0, a0, a1
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 3
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 5
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 5
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -349,25 +348,25 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
 ; RV64ZBB-LABEL: test_bitreverse_i16:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    rev8 a0, a0
-; RV64ZBB-NEXT:    srli a1, a0, 44
-; RV64ZBB-NEXT:    lui a2, 15
-; RV64ZBB-NEXT:    addiw a2, a2, 240
-; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a1, 15
+; RV64ZBB-NEXT:    srli a2, a0, 44
+; RV64ZBB-NEXT:    addiw a1, a1, 240
+; RV64ZBB-NEXT:    and a1, a2, a1
+; RV64ZBB-NEXT:    lui a2, 3
 ; RV64ZBB-NEXT:    srli a0, a0, 52
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    andi a0, a0, -241
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 3
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 5
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 5
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -394,34 +393,34 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    lui a3, 61681
+; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a3, a3, -241
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -430,34 +429,34 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srliw a3, a0, 24
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, 1365
 ; RV64I-NEXT:    slliw a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slliw a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slliw a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -465,25 +464,25 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV32ZBB-LABEL: test_bitreverse_i32:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    rev8 a0, a0
-; RV32ZBB-NEXT:    srli a1, a0, 4
-; RV32ZBB-NEXT:    lui a2, 61681
-; RV32ZBB-NEXT:    addi a2, a2, -241
-; RV32ZBB-NEXT:    and a1, a1, a2
-; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a1, 61681
+; RV32ZBB-NEXT:    srli a2, a0, 4
+; RV32ZBB-NEXT:    addi a1, a1, -241
+; RV32ZBB-NEXT:    and a2, a2, a1
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    lui a1, 209715
+; RV32ZBB-NEXT:    addi a1, a1, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
-; RV32ZBB-NEXT:    or a0, a1, a0
-; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 209715
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
-; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    srli a2, a0, 2
+; RV32ZBB-NEXT:    and a0, a0, a1
+; RV32ZBB-NEXT:    and a1, a2, a1
+; RV32ZBB-NEXT:    lui a2, 349525
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 349525
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -491,28 +490,28 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
 ; RV64ZBB-LABEL: test_bitreverse_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    rev8 a0, a0
-; RV64ZBB-NEXT:    srli a1, a0, 36
-; RV64ZBB-NEXT:    lui a2, 61681
-; RV64ZBB-NEXT:    addiw a2, a2, -241
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    srli a0, a0, 28
+; RV64ZBB-NEXT:    lui a1, 61681
+; RV64ZBB-NEXT:    srli a2, a0, 36
+; RV64ZBB-NEXT:    addiw a1, a1, -241
+; RV64ZBB-NEXT:    and a1, a2, a1
 ; RV64ZBB-NEXT:    lui a2, 986895
+; RV64ZBB-NEXT:    srli a0, a0, 28
 ; RV64ZBB-NEXT:    addi a2, a2, 240
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    sext.w a0, a0
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 209715
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 349525
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slliw a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slliw a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -538,115 +537,114 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 8
 ; RV32I-NEXT:    lui a3, 16
+; RV32I-NEXT:    srli a4, a1, 24
+; RV32I-NEXT:    slli a5, a1, 24
+; RV32I-NEXT:    lui a6, 61681
+; RV32I-NEXT:    srli a7, a0, 8
 ; RV32I-NEXT:    addi a3, a3, -256
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    srli a4, a0, 24
+; RV32I-NEXT:    and a7, a7, a3
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    lui a7, 209715
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a5, a1
+; RV32I-NEXT:    lui a5, 349525
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    addi a6, a6, -241
+; RV32I-NEXT:    addi a7, a7, 819
+; RV32I-NEXT:    addi a5, a5, 1365
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    srli a2, a1, 4
-; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi a4, a4, -241
-; RV32I-NEXT:    and a2, a2, a4
-; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a1, a1, a6
+; RV32I-NEXT:    srli a3, a0, 4
+; RV32I-NEXT:    and a0, a0, a6
+; RV32I-NEXT:    and a2, a2, a6
 ; RV32I-NEXT:    slli a1, a1, 4
+; RV32I-NEXT:    and a3, a3, a6
+; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    srli a2, a1, 2
-; RV32I-NEXT:    lui a5, 209715
-; RV32I-NEXT:    addi a5, a5, 819
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a1, a1, a7
+; RV32I-NEXT:    srli a3, a0, 2
+; RV32I-NEXT:    and a0, a0, a7
+; RV32I-NEXT:    and a2, a2, a7
 ; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    and a3, a3, a7
+; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    srli a2, a1, 1
-; RV32I-NEXT:    lui a6, 349525
-; RV32I-NEXT:    addi a6, a6, 1365
-; RV32I-NEXT:    and a2, a2, a6
-; RV32I-NEXT:    and a1, a1, a6
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    or a2, a2, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    srli a7, a0, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    slli a0, a0, 4
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    srli a1, a0, 2
 ; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    srli a3, a0, 1
 ; RV32I-NEXT:    and a0, a0, a5
-; RV32I-NEXT:    slli a0, a0, 2
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    and a1, a1, a6
-; RV32I-NEXT:    and a0, a0, a6
-; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    or a1, a1, a0
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    and a2, a2, a5
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a3, a3, a5
+; RV32I-NEXT:    slli a4, a0, 1
+; RV32I-NEXT:    or a0, a2, a1
+; RV32I-NEXT:    or a1, a3, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_bitreverse_i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 40
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srli a3, a0, 56
+; RV64I-NEXT:    srli a4, a0, 24
+; RV64I-NEXT:    lui a5, 4080
+; RV64I-NEXT:    srli a6, a0, 8
+; RV64I-NEXT:    srliw a7, a0, 24
+; RV64I-NEXT:    lui t0, 61681
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    lui a4, 4080
-; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a5, a0, 8
-; RV64I-NEXT:    srliw a5, a5, 24
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    and a4, a4, a5
+; RV64I-NEXT:    srliw a6, a6, 24
+; RV64I-NEXT:    slli a6, a6, 24
+; RV64I-NEXT:    or a4, a6, a4
+; RV64I-NEXT:    lui a6, 349525
+; RV64I-NEXT:    and a5, a0, a5
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    addiw t0, t0, -241
+; RV64I-NEXT:    addiw a3, a3, 819
+; RV64I-NEXT:    addiw a6, a6, 1365
 ; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    and a4, a0, a4
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a5, a5, a7
+; RV64I-NEXT:    slli a7, t0, 32
+; RV64I-NEXT:    add a7, t0, a7
+; RV64I-NEXT:    slli t0, a3, 32
+; RV64I-NEXT:    add a3, a3, t0
+; RV64I-NEXT:    slli t0, a6, 32
+; RV64I-NEXT:    add a6, a6, t0
+; RV64I-NEXT:    or a1, a4, a1
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a7
+; RV64I-NEXT:    and a1, a1, a7
 ; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a6
+; RV64I-NEXT:    and a1, a1, a6
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -654,74 +652,73 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32ZBB-LABEL: test_bitreverse_i64:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    rev8 a1, a1
-; RV32ZBB-NEXT:    srli a2, a1, 4
-; RV32ZBB-NEXT:    lui a3, 61681
-; RV32ZBB-NEXT:    addi a3, a3, -241
-; RV32ZBB-NEXT:    and a2, a2, a3
-; RV32ZBB-NEXT:    and a1, a1, a3
-; RV32ZBB-NEXT:    slli a1, a1, 4
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 2
-; RV32ZBB-NEXT:    lui a4, 209715
-; RV32ZBB-NEXT:    addi a4, a4, 819
-; RV32ZBB-NEXT:    and a2, a2, a4
-; RV32ZBB-NEXT:    and a1, a1, a4
-; RV32ZBB-NEXT:    slli a1, a1, 2
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 1
-; RV32ZBB-NEXT:    lui a5, 349525
-; RV32ZBB-NEXT:    addi a5, a5, 1365
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a1, a1, a5
-; RV32ZBB-NEXT:    slli a1, a1, 1
-; RV32ZBB-NEXT:    or a2, a2, a1
+; RV32ZBB-NEXT:    lui a2, 61681
+; RV32ZBB-NEXT:    lui a3, 209715
 ; RV32ZBB-NEXT:    rev8 a0, a0
-; RV32ZBB-NEXT:    srli a1, a0, 4
+; RV32ZBB-NEXT:    srli a4, a1, 4
+; RV32ZBB-NEXT:    addi a2, a2, -241
+; RV32ZBB-NEXT:    srli a5, a0, 4
+; RV32ZBB-NEXT:    and a4, a4, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    and a5, a5, a2
+; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 349525
+; RV32ZBB-NEXT:    addi a3, a3, 819
+; RV32ZBB-NEXT:    addi a2, a2, 1365
+; RV32ZBB-NEXT:    slli a1, a1, 4
+; RV32ZBB-NEXT:    slli a0, a0, 4
+; RV32ZBB-NEXT:    or a1, a4, a1
+; RV32ZBB-NEXT:    or a0, a5, a0
+; RV32ZBB-NEXT:    srli a4, a1, 2
 ; RV32ZBB-NEXT:    and a1, a1, a3
+; RV32ZBB-NEXT:    srli a5, a0, 2
 ; RV32ZBB-NEXT:    and a0, a0, a3
-; RV32ZBB-NEXT:    slli a0, a0, 4
-; RV32ZBB-NEXT:    or a0, a1, a0
-; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    and a1, a1, a4
-; RV32ZBB-NEXT:    and a0, a0, a4
+; RV32ZBB-NEXT:    and a4, a4, a3
+; RV32ZBB-NEXT:    slli a1, a1, 2
+; RV32ZBB-NEXT:    and a3, a5, a3
 ; RV32ZBB-NEXT:    slli a0, a0, 2
-; RV32ZBB-NEXT:    or a0, a1, a0
-; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    and a1, a1, a5
-; RV32ZBB-NEXT:    and a0, a0, a5
-; RV32ZBB-NEXT:    slli a0, a0, 1
-; RV32ZBB-NEXT:    or a1, a1, a0
-; RV32ZBB-NEXT:    mv a0, a2
+; RV32ZBB-NEXT:    or a1, a4, a1
+; RV32ZBB-NEXT:    or a0, a3, a0
+; RV32ZBB-NEXT:    srli a3, a1, 1
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    srli a4, a0, 1
+; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a3, a3, a2
+; RV32ZBB-NEXT:    slli a1, a1, 1
+; RV32ZBB-NEXT:    and a2, a4, a2
+; RV32ZBB-NEXT:    slli a4, a0, 1
+; RV32ZBB-NEXT:    or a0, a3, a1
+; RV32ZBB-NEXT:    or a1, a2, a4
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: test_bitreverse_i64:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    rev8 a0, a0
-; RV64ZBB-NEXT:    srli a1, a0, 4
-; RV64ZBB-NEXT:    lui a2, 61681
-; RV64ZBB-NEXT:    addiw a2, a2, -241
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
-; RV64ZBB-NEXT:    slli a0, a0, 4
-; RV64ZBB-NEXT:    or a0, a1, a0
-; RV64ZBB-NEXT:    srli a1, a0, 2
+; RV64ZBB-NEXT:    lui a1, 61681
 ; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    lui a3, 349525
+; RV64ZBB-NEXT:    addiw a1, a1, -241
 ; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    addiw a3, a3, 1365
+; RV64ZBB-NEXT:    slli a4, a1, 32
+; RV64ZBB-NEXT:    add a1, a1, a4
+; RV64ZBB-NEXT:    slli a4, a2, 32
+; RV64ZBB-NEXT:    add a2, a2, a4
+; RV64ZBB-NEXT:    slli a4, a3, 32
+; RV64ZBB-NEXT:    add a3, a3, a4
+; RV64ZBB-NEXT:    srli a4, a0, 4
+; RV64ZBB-NEXT:    and a4, a4, a1
+; RV64ZBB-NEXT:    and a0, a0, a1
+; RV64ZBB-NEXT:    slli a0, a0, 4
+; RV64ZBB-NEXT:    or a0, a4, a0
+; RV64ZBB-NEXT:    srli a1, a0, 2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a0, a0, a3
+; RV64ZBB-NEXT:    and a1, a1, a3
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -729,10 +726,9 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
 ; RV32ZBKB-LABEL: test_bitreverse_i64:
 ; RV32ZBKB:       # %bb.0:
 ; RV32ZBKB-NEXT:    rev8 a1, a1
-; RV32ZBKB-NEXT:    brev8 a2, a1
-; RV32ZBKB-NEXT:    rev8 a0, a0
-; RV32ZBKB-NEXT:    brev8 a1, a0
-; RV32ZBKB-NEXT:    mv a0, a2
+; RV32ZBKB-NEXT:    rev8 a2, a0
+; RV32ZBKB-NEXT:    brev8 a0, a1
+; RV32ZBKB-NEXT:    brev8 a1, a2
 ; RV32ZBKB-NEXT:    ret
 ;
 ; RV64ZBKB-LABEL: test_bitreverse_i64:
@@ -752,20 +748,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 ; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 3
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 5
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -777,20 +773,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 3
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 3
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 5
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -802,20 +798,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 ; RV32ZBB-NEXT:    addi a2, a2, -241
 ; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 3
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 3
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 5
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 5
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -827,20 +823,20 @@ define i16 @test_bswap_bitreverse_i16(i16 %a) nounwind {
 ; RV64ZBB-NEXT:    addiw a2, a2, -241
 ; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 3
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    slli a0, a0, 4
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 3
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 5
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 5
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -867,20 +863,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -892,20 +888,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slliw a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 349525
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slliw a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slliw a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -917,20 +913,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 ; RV32ZBB-NEXT:    addi a2, a2, -241
 ; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 209715
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 209715
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 349525
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 349525
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -942,20 +938,20 @@ define i32 @test_bswap_bitreverse_i32(i32 %a) nounwind {
 ; RV64ZBB-NEXT:    addiw a2, a2, -241
 ; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    slliw a0, a0, 4
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 209715
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 349525
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slliw a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slliw a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -979,69 +975,69 @@ define i64 @test_bswap_bitreverse_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    lui a3, 61681
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 4
 ; RV32I-NEXT:    addi a3, a3, -241
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    slli a1, a1, 4
 ; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    srli a5, a1, 2
+; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    and a4, a5, a4
+; RV32I-NEXT:    slli a1, a1, 2
 ; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    srli a2, a0, 1
-; RV32I-NEXT:    lui a5, 349525
-; RV32I-NEXT:    addi a5, a5, 1365
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a0, a0, a5
-; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    or a0, a2, a0
-; RV32I-NEXT:    srli a2, a1, 4
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    srli a4, a1, 1
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a1, 2
-; RV32I-NEXT:    and a2, a2, a4
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a1, 1
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a3, a4, a3
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_bswap_bitreverse_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    slli a0, a0, 4
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    addiw a3, a3, 1365
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    srli a4, a0, 4
+; RV64I-NEXT:    and a4, a4, a1
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1050,69 +1046,69 @@ define i64 @test_bswap_bitreverse_i64(i64 %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srli a2, a0, 4
 ; RV32ZBB-NEXT:    lui a3, 61681
+; RV32ZBB-NEXT:    lui a4, 209715
+; RV32ZBB-NEXT:    srli a5, a1, 4
 ; RV32ZBB-NEXT:    addi a3, a3, -241
 ; RV32ZBB-NEXT:    and a2, a2, a3
 ; RV32ZBB-NEXT:    and a0, a0, a3
+; RV32ZBB-NEXT:    and a5, a5, a3
+; RV32ZBB-NEXT:    and a1, a1, a3
+; RV32ZBB-NEXT:    lui a3, 349525
+; RV32ZBB-NEXT:    addi a4, a4, 819
+; RV32ZBB-NEXT:    addi a3, a3, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 4
+; RV32ZBB-NEXT:    slli a1, a1, 4
 ; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a5, a1
 ; RV32ZBB-NEXT:    srli a2, a0, 2
-; RV32ZBB-NEXT:    lui a4, 209715
-; RV32ZBB-NEXT:    addi a4, a4, 819
-; RV32ZBB-NEXT:    and a2, a2, a4
 ; RV32ZBB-NEXT:    and a0, a0, a4
+; RV32ZBB-NEXT:    srli a5, a1, 2
+; RV32ZBB-NEXT:    and a1, a1, a4
+; RV32ZBB-NEXT:    and a2, a2, a4
 ; RV32ZBB-NEXT:    slli a0, a0, 2
+; RV32ZBB-NEXT:    and a4, a5, a4
+; RV32ZBB-NEXT:    slli a1, a1, 2
 ; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a4, a1
 ; RV32ZBB-NEXT:    srli a2, a0, 1
-; RV32ZBB-NEXT:    lui a5, 349525
-; RV32ZBB-NEXT:    addi a5, a5, 1365
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a0, a0, a5
-; RV32ZBB-NEXT:    slli a0, a0, 1
-; RV32ZBB-NEXT:    or a0, a2, a0
-; RV32ZBB-NEXT:    srli a2, a1, 4
-; RV32ZBB-NEXT:    and a2, a2, a3
+; RV32ZBB-NEXT:    and a0, a0, a3
+; RV32ZBB-NEXT:    srli a4, a1, 1
 ; RV32ZBB-NEXT:    and a1, a1, a3
-; RV32ZBB-NEXT:    slli a1, a1, 4
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 2
-; RV32ZBB-NEXT:    and a2, a2, a4
-; RV32ZBB-NEXT:    and a1, a1, a4
-; RV32ZBB-NEXT:    slli a1, a1, 2
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 1
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a1, a1, a5
+; RV32ZBB-NEXT:    and a2, a2, a3
+; RV32ZBB-NEXT:    slli a0, a0, 1
+; RV32ZBB-NEXT:    and a3, a4, a3
 ; RV32ZBB-NEXT:    slli a1, a1, 1
-; RV32ZBB-NEXT:    or a1, a2, a1
+; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a3, a1
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: test_bswap_bitreverse_i64:
 ; RV64ZBB:       # %bb.0:
-; RV64ZBB-NEXT:    srli a1, a0, 4
-; RV64ZBB-NEXT:    lui a2, 61681
-; RV64ZBB-NEXT:    addiw a2, a2, -241
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
-; RV64ZBB-NEXT:    slli a0, a0, 4
-; RV64ZBB-NEXT:    or a0, a1, a0
-; RV64ZBB-NEXT:    srli a1, a0, 2
+; RV64ZBB-NEXT:    lui a1, 61681
 ; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    lui a3, 349525
+; RV64ZBB-NEXT:    addiw a1, a1, -241
 ; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    addiw a3, a3, 1365
+; RV64ZBB-NEXT:    slli a4, a1, 32
+; RV64ZBB-NEXT:    add a1, a1, a4
+; RV64ZBB-NEXT:    slli a4, a2, 32
+; RV64ZBB-NEXT:    add a2, a2, a4
+; RV64ZBB-NEXT:    slli a4, a3, 32
+; RV64ZBB-NEXT:    add a3, a3, a4
+; RV64ZBB-NEXT:    srli a4, a0, 4
+; RV64ZBB-NEXT:    and a4, a4, a1
+; RV64ZBB-NEXT:    and a0, a0, a1
+; RV64ZBB-NEXT:    slli a0, a0, 4
+; RV64ZBB-NEXT:    or a0, a4, a0
+; RV64ZBB-NEXT:    srli a1, a0, 2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a0, a0, a3
+; RV64ZBB-NEXT:    and a1, a1, a3
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -1140,20 +1136,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 ; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 3
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 5
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -1165,20 +1161,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 3
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slli a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 3
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 5
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 5
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1190,20 +1186,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 ; RV32ZBB-NEXT:    addi a2, a2, -241
 ; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 3
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 3
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 5
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 5
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -1215,20 +1211,20 @@ define i16 @test_bitreverse_bswap_i16(i16 %a) nounwind {
 ; RV64ZBB-NEXT:    addiw a2, a2, -241
 ; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 3
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    slli a0, a0, 4
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 3
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 5
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 5
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -1255,20 +1251,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    addi a2, a2, -241
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    slli a0, a0, 4
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    slli a0, a0, 2
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -1280,20 +1276,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    addiw a2, a2, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    slliw a0, a0, 4
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 349525
+; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    slliw a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slliw a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1305,20 +1301,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 ; RV32ZBB-NEXT:    addi a2, a2, -241
 ; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    lui a2, 209715
+; RV32ZBB-NEXT:    addi a2, a2, 819
 ; RV32ZBB-NEXT:    slli a0, a0, 4
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 2
-; RV32ZBB-NEXT:    lui a2, 209715
-; RV32ZBB-NEXT:    addi a2, a2, 819
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
+; RV32ZBB-NEXT:    lui a2, 349525
+; RV32ZBB-NEXT:    addi a2, a2, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 2
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    srli a1, a0, 1
-; RV32ZBB-NEXT:    lui a2, 349525
-; RV32ZBB-NEXT:    addi a2, a2, 1365
-; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    and a0, a0, a2
+; RV32ZBB-NEXT:    and a1, a1, a2
 ; RV32ZBB-NEXT:    slli a0, a0, 1
 ; RV32ZBB-NEXT:    or a0, a1, a0
 ; RV32ZBB-NEXT:    ret
@@ -1330,20 +1326,20 @@ define i32 @test_bitreverse_bswap_i32(i32 %a) nounwind {
 ; RV64ZBB-NEXT:    addiw a2, a2, -241
 ; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    addiw a2, a2, 819
 ; RV64ZBB-NEXT:    slliw a0, a0, 4
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 2
-; RV64ZBB-NEXT:    lui a2, 209715
-; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    lui a2, 349525
+; RV64ZBB-NEXT:    addiw a2, a2, 1365
 ; RV64ZBB-NEXT:    slliw a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slliw a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret
@@ -1367,69 +1363,69 @@ define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 4
 ; RV32I-NEXT:    lui a3, 61681
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 4
 ; RV32I-NEXT:    addi a3, a3, -241
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    slli a0, a0, 4
+; RV32I-NEXT:    slli a1, a1, 4
 ; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    srli a2, a0, 2
-; RV32I-NEXT:    lui a4, 209715
-; RV32I-NEXT:    addi a4, a4, 819
-; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    srli a5, a1, 2
+; RV32I-NEXT:    and a1, a1, a4
+; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    and a4, a5, a4
+; RV32I-NEXT:    slli a1, a1, 2
 ; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a4, a1
 ; RV32I-NEXT:    srli a2, a0, 1
-; RV32I-NEXT:    lui a5, 349525
-; RV32I-NEXT:    addi a5, a5, 1365
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a0, a0, a5
-; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    or a0, a2, a0
-; RV32I-NEXT:    srli a2, a1, 4
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    srli a4, a1, 1
 ; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    slli a1, a1, 4
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a1, 2
-; RV32I-NEXT:    and a2, a2, a4
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a1, 1
-; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a3, a4, a3
 ; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test_bitreverse_bswap_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addiw a2, a2, -241
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    slli a0, a0, 4
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    srli a1, a0, 2
+; RV64I-NEXT:    lui a1, 61681
 ; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    addiw a1, a1, -241
 ; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    addiw a3, a3, 1365
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    srli a4, a0, 4
+; RV64I-NEXT:    and a4, a4, a1
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    slli a0, a0, 4
+; RV64I-NEXT:    or a0, a4, a0
+; RV64I-NEXT:    srli a1, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 2
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    slli a3, a2, 32
-; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    and a0, a0, a3
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    or a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1438,69 +1434,69 @@ define i64 @test_bitreverse_bswap_i64(i64 %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srli a2, a0, 4
 ; RV32ZBB-NEXT:    lui a3, 61681
+; RV32ZBB-NEXT:    lui a4, 209715
+; RV32ZBB-NEXT:    srli a5, a1, 4
 ; RV32ZBB-NEXT:    addi a3, a3, -241
 ; RV32ZBB-NEXT:    and a2, a2, a3
 ; RV32ZBB-NEXT:    and a0, a0, a3
+; RV32ZBB-NEXT:    and a5, a5, a3
+; RV32ZBB-NEXT:    and a1, a1, a3
+; RV32ZBB-NEXT:    lui a3, 349525
+; RV32ZBB-NEXT:    addi a4, a4, 819
+; RV32ZBB-NEXT:    addi a3, a3, 1365
 ; RV32ZBB-NEXT:    slli a0, a0, 4
+; RV32ZBB-NEXT:    slli a1, a1, 4
 ; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a5, a1
 ; RV32ZBB-NEXT:    srli a2, a0, 2
-; RV32ZBB-NEXT:    lui a4, 209715
-; RV32ZBB-NEXT:    addi a4, a4, 819
-; RV32ZBB-NEXT:    and a2, a2, a4
 ; RV32ZBB-NEXT:    and a0, a0, a4
+; RV32ZBB-NEXT:    srli a5, a1, 2
+; RV32ZBB-NEXT:    and a1, a1, a4
+; RV32ZBB-NEXT:    and a2, a2, a4
 ; RV32ZBB-NEXT:    slli a0, a0, 2
+; RV32ZBB-NEXT:    and a4, a5, a4
+; RV32ZBB-NEXT:    slli a1, a1, 2
 ; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a4, a1
 ; RV32ZBB-NEXT:    srli a2, a0, 1
-; RV32ZBB-NEXT:    lui a5, 349525
-; RV32ZBB-NEXT:    addi a5, a5, 1365
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a0, a0, a5
-; RV32ZBB-NEXT:    slli a0, a0, 1
-; RV32ZBB-NEXT:    or a0, a2, a0
-; RV32ZBB-NEXT:    srli a2, a1, 4
-; RV32ZBB-NEXT:    and a2, a2, a3
+; RV32ZBB-NEXT:    and a0, a0, a3
+; RV32ZBB-NEXT:    srli a4, a1, 1
 ; RV32ZBB-NEXT:    and a1, a1, a3
-; RV32ZBB-NEXT:    slli a1, a1, 4
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 2
-; RV32ZBB-NEXT:    and a2, a2, a4
-; RV32ZBB-NEXT:    and a1, a1, a4
-; RV32ZBB-NEXT:    slli a1, a1, 2
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    srli a2, a1, 1
-; RV32ZBB-NEXT:    and a2, a2, a5
-; RV32ZBB-NEXT:    and a1, a1, a5
+; RV32ZBB-NEXT:    and a2, a2, a3
+; RV32ZBB-NEXT:    slli a0, a0, 1
+; RV32ZBB-NEXT:    and a3, a4, a3
 ; RV32ZBB-NEXT:    slli a1, a1, 1
-; RV32ZBB-NEXT:    or a1, a2, a1
+; RV32ZBB-NEXT:    or a0, a2, a0
+; RV32ZBB-NEXT:    or a1, a3, a1
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: test_bitreverse_bswap_i64:
 ; RV64ZBB:       # %bb.0:
-; RV64ZBB-NEXT:    srli a1, a0, 4
-; RV64ZBB-NEXT:    lui a2, 61681
-; RV64ZBB-NEXT:    addiw a2, a2, -241
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
-; RV64ZBB-NEXT:    slli a0, a0, 4
-; RV64ZBB-NEXT:    or a0, a1, a0
-; RV64ZBB-NEXT:    srli a1, a0, 2
+; RV64ZBB-NEXT:    lui a1, 61681
 ; RV64ZBB-NEXT:    lui a2, 209715
+; RV64ZBB-NEXT:    lui a3, 349525
+; RV64ZBB-NEXT:    addiw a1, a1, -241
 ; RV64ZBB-NEXT:    addiw a2, a2, 819
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
+; RV64ZBB-NEXT:    addiw a3, a3, 1365
+; RV64ZBB-NEXT:    slli a4, a1, 32
+; RV64ZBB-NEXT:    add a1, a1, a4
+; RV64ZBB-NEXT:    slli a4, a2, 32
+; RV64ZBB-NEXT:    add a2, a2, a4
+; RV64ZBB-NEXT:    slli a4, a3, 32
+; RV64ZBB-NEXT:    add a3, a3, a4
+; RV64ZBB-NEXT:    srli a4, a0, 4
+; RV64ZBB-NEXT:    and a4, a4, a1
+; RV64ZBB-NEXT:    and a0, a0, a1
+; RV64ZBB-NEXT:    slli a0, a0, 4
+; RV64ZBB-NEXT:    or a0, a4, a0
+; RV64ZBB-NEXT:    srli a1, a0, 2
 ; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a1, a1, a2
 ; RV64ZBB-NEXT:    slli a0, a0, 2
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    srli a1, a0, 1
-; RV64ZBB-NEXT:    lui a2, 349525
-; RV64ZBB-NEXT:    addiw a2, a2, 1365
-; RV64ZBB-NEXT:    slli a3, a2, 32
-; RV64ZBB-NEXT:    add a2, a2, a3
-; RV64ZBB-NEXT:    and a1, a1, a2
-; RV64ZBB-NEXT:    and a0, a0, a2
+; RV64ZBB-NEXT:    and a0, a0, a3
+; RV64ZBB-NEXT:    and a1, a1, a3
 ; RV64ZBB-NEXT:    slli a0, a0, 1
 ; RV64ZBB-NEXT:    or a0, a1, a0
 ; RV64ZBB-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/calling-conv-half.ll
index cccb69d2e6986a..541c9b4d40c7e1 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-half.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-half.ll
@@ -333,8 +333,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lui a0, 5
-; RV32I-NEXT:    addi t0, a0, -1792
+; RV32I-NEXT:    lui a7, 5
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    li a1, 2
 ; RV32I-NEXT:    li a2, 3
@@ -342,6 +341,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32I-NEXT:    li a4, 5
 ; RV32I-NEXT:    li a5, 6
 ; RV32I-NEXT:    li a6, 7
+; RV32I-NEXT:    addi t0, a7, -1792
 ; RV32I-NEXT:    li a7, 8
 ; RV32I-NEXT:    sw t0, 0(sp)
 ; RV32I-NEXT:    call callee_half_on_stack
@@ -353,8 +353,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lui a0, 5
-; RV64I-NEXT:    addiw t0, a0, -1792
+; RV64I-NEXT:    lui a7, 5
 ; RV64I-NEXT:    li a0, 1
 ; RV64I-NEXT:    li a1, 2
 ; RV64I-NEXT:    li a2, 3
@@ -362,6 +361,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    li a5, 6
 ; RV64I-NEXT:    li a6, 7
+; RV64I-NEXT:    addiw t0, a7, -1792
 ; RV64I-NEXT:    li a7, 8
 ; RV64I-NEXT:    sd t0, 0(sp)
 ; RV64I-NEXT:    call callee_half_on_stack
@@ -373,8 +373,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    addi sp, sp, -16
 ; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IF-NEXT:    lui a0, 1048565
-; RV32IF-NEXT:    addi t0, a0, -1792
+; RV32IF-NEXT:    lui a7, 1048565
 ; RV32IF-NEXT:    li a0, 1
 ; RV32IF-NEXT:    li a1, 2
 ; RV32IF-NEXT:    li a2, 3
@@ -382,6 +381,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32IF-NEXT:    li a4, 5
 ; RV32IF-NEXT:    li a5, 6
 ; RV32IF-NEXT:    li a6, 7
+; RV32IF-NEXT:    addi t0, a7, -1792
 ; RV32IF-NEXT:    li a7, 8
 ; RV32IF-NEXT:    sw t0, 0(sp)
 ; RV32IF-NEXT:    call callee_half_on_stack
@@ -393,8 +393,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64IF:       # %bb.0:
 ; RV64IF-NEXT:    addi sp, sp, -16
 ; RV64IF-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64IF-NEXT:    lui a0, 1048565
-; RV64IF-NEXT:    addi t0, a0, -1792
+; RV64IF-NEXT:    lui a7, 1048565
 ; RV64IF-NEXT:    li a0, 1
 ; RV64IF-NEXT:    li a1, 2
 ; RV64IF-NEXT:    li a2, 3
@@ -402,6 +401,7 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64IF-NEXT:    li a4, 5
 ; RV64IF-NEXT:    li a5, 6
 ; RV64IF-NEXT:    li a6, 7
+; RV64IF-NEXT:    addi t0, a7, -1792
 ; RV64IF-NEXT:    li a7, 8
 ; RV64IF-NEXT:    sw t0, 0(sp)
 ; RV64IF-NEXT:    call callee_half_on_stack
@@ -413,12 +413,12 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32-ILP32F:       # %bb.0:
 ; RV32-ILP32F-NEXT:    addi sp, sp, -16
 ; RV32-ILP32F-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-ILP32F-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32-ILP32F-NEXT:    flw fa0, %lo(.LCPI3_0)(a0)
+; RV32-ILP32F-NEXT:    lui a4, %hi(.LCPI3_0)
 ; RV32-ILP32F-NEXT:    li a0, 1
 ; RV32-ILP32F-NEXT:    li a1, 2
 ; RV32-ILP32F-NEXT:    li a2, 3
 ; RV32-ILP32F-NEXT:    li a3, 4
+; RV32-ILP32F-NEXT:    flw fa0, %lo(.LCPI3_0)(a4)
 ; RV32-ILP32F-NEXT:    li a4, 5
 ; RV32-ILP32F-NEXT:    li a5, 6
 ; RV32-ILP32F-NEXT:    li a6, 7
@@ -432,12 +432,12 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64-LP64F:       # %bb.0:
 ; RV64-LP64F-NEXT:    addi sp, sp, -16
 ; RV64-LP64F-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-LP64F-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV64-LP64F-NEXT:    flw fa0, %lo(.LCPI3_0)(a0)
+; RV64-LP64F-NEXT:    lui a4, %hi(.LCPI3_0)
 ; RV64-LP64F-NEXT:    li a0, 1
 ; RV64-LP64F-NEXT:    li a1, 2
 ; RV64-LP64F-NEXT:    li a2, 3
 ; RV64-LP64F-NEXT:    li a3, 4
+; RV64-LP64F-NEXT:    flw fa0, %lo(.LCPI3_0)(a4)
 ; RV64-LP64F-NEXT:    li a4, 5
 ; RV64-LP64F-NEXT:    li a5, 6
 ; RV64-LP64F-NEXT:    li a6, 7
@@ -451,12 +451,12 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV32-ILP32ZFHMIN:       # %bb.0:
 ; RV32-ILP32ZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32-ILP32ZFHMIN-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-ILP32ZFHMIN-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV32-ILP32ZFHMIN-NEXT:    flh fa0, %lo(.LCPI3_0)(a0)
+; RV32-ILP32ZFHMIN-NEXT:    lui a4, %hi(.LCPI3_0)
 ; RV32-ILP32ZFHMIN-NEXT:    li a0, 1
 ; RV32-ILP32ZFHMIN-NEXT:    li a1, 2
 ; RV32-ILP32ZFHMIN-NEXT:    li a2, 3
 ; RV32-ILP32ZFHMIN-NEXT:    li a3, 4
+; RV32-ILP32ZFHMIN-NEXT:    flh fa0, %lo(.LCPI3_0)(a4)
 ; RV32-ILP32ZFHMIN-NEXT:    li a4, 5
 ; RV32-ILP32ZFHMIN-NEXT:    li a5, 6
 ; RV32-ILP32ZFHMIN-NEXT:    li a6, 7
@@ -470,12 +470,12 @@ define i32 @caller_half_on_stack() nounwind {
 ; RV64-LP64ZFHMIN:       # %bb.0:
 ; RV64-LP64ZFHMIN-NEXT:    addi sp, sp, -16
 ; RV64-LP64ZFHMIN-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-LP64ZFHMIN-NEXT:    lui a0, %hi(.LCPI3_0)
-; RV64-LP64ZFHMIN-NEXT:    flh fa0, %lo(.LCPI3_0)(a0)
+; RV64-LP64ZFHMIN-NEXT:    lui a4, %hi(.LCPI3_0)
 ; RV64-LP64ZFHMIN-NEXT:    li a0, 1
 ; RV64-LP64ZFHMIN-NEXT:    li a1, 2
 ; RV64-LP64ZFHMIN-NEXT:    li a2, 3
 ; RV64-LP64ZFHMIN-NEXT:    li a3, 4
+; RV64-LP64ZFHMIN-NEXT:    flh fa0, %lo(.LCPI3_0)(a4)
 ; RV64-LP64ZFHMIN-NEXT:    li a4, 5
 ; RV64-LP64ZFHMIN-NEXT:    li a5, 6
 ; RV64-LP64ZFHMIN-NEXT:    li a6, 7

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
index e97a3bff32fac7..9387b7ef4c32ec 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll
@@ -145,43 +145,45 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32I-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    li a0, 18
-; RV32I-FPELIM-NEXT:    li a1, 17
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 24(sp)
-; RV32I-FPELIM-NEXT:    li a0, 16
-; RV32I-FPELIM-NEXT:    lui a1, 262236
-; RV32I-FPELIM-NEXT:    addi a1, a1, 655
-; RV32I-FPELIM-NEXT:    lui a2, 377487
-; RV32I-FPELIM-NEXT:    addi a2, a2, 1475
-; RV32I-FPELIM-NEXT:    li a3, 15
-; RV32I-FPELIM-NEXT:    sw a3, 0(sp)
-; RV32I-FPELIM-NEXT:    sw a2, 8(sp)
-; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 16(sp)
-; RV32I-FPELIM-NEXT:    lui a0, 262153
-; RV32I-FPELIM-NEXT:    addi t0, a0, 491
-; RV32I-FPELIM-NEXT:    lui a0, 545260
-; RV32I-FPELIM-NEXT:    addi t1, a0, -1967
-; RV32I-FPELIM-NEXT:    lui a0, 964690
-; RV32I-FPELIM-NEXT:    addi t2, a0, -328
-; RV32I-FPELIM-NEXT:    lui a0, 335544
-; RV32I-FPELIM-NEXT:    addi t3, a0, 1311
-; RV32I-FPELIM-NEXT:    lui a0, 688509
-; RV32I-FPELIM-NEXT:    addi a5, a0, -2048
+; RV32I-FPELIM-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32I-FPELIM-NEXT:    li a5, 18
+; RV32I-FPELIM-NEXT:    li a6, 17
+; RV32I-FPELIM-NEXT:    li a7, 16
+; RV32I-FPELIM-NEXT:    lui t0, 262236
+; RV32I-FPELIM-NEXT:    lui t1, 377487
+; RV32I-FPELIM-NEXT:    li t2, 15
+; RV32I-FPELIM-NEXT:    lui t3, 262153
+; RV32I-FPELIM-NEXT:    lui t4, 545260
+; RV32I-FPELIM-NEXT:    lui t5, 964690
+; RV32I-FPELIM-NEXT:    lui t6, 335544
+; RV32I-FPELIM-NEXT:    lui s0, 688509
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a1, 11
 ; RV32I-FPELIM-NEXT:    addi a2, sp, 32
 ; RV32I-FPELIM-NEXT:    li a3, 12
 ; RV32I-FPELIM-NEXT:    li a4, 13
+; RV32I-FPELIM-NEXT:    sw a6, 20(sp)
+; RV32I-FPELIM-NEXT:    sw a5, 24(sp)
 ; RV32I-FPELIM-NEXT:    li a6, 4
+; RV32I-FPELIM-NEXT:    addi a5, t0, 655
+; RV32I-FPELIM-NEXT:    addi t0, t1, 1475
+; RV32I-FPELIM-NEXT:    sw t2, 0(sp)
+; RV32I-FPELIM-NEXT:    sw t0, 8(sp)
+; RV32I-FPELIM-NEXT:    sw a5, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a7, 16(sp)
 ; RV32I-FPELIM-NEXT:    li a7, 14
+; RV32I-FPELIM-NEXT:    addi t0, t3, 491
+; RV32I-FPELIM-NEXT:    addi t1, t4, -1967
+; RV32I-FPELIM-NEXT:    addi t2, t5, -328
+; RV32I-FPELIM-NEXT:    addi t3, t6, 1311
+; RV32I-FPELIM-NEXT:    addi a5, s0, -2048
 ; RV32I-FPELIM-NEXT:    sw t3, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw t2, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw t1, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw t0, 44(sp)
 ; RV32I-FPELIM-NEXT:    call callee_aligned_stack
 ; RV32I-FPELIM-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
+; RV32I-FPELIM-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 64
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -190,38 +192,39 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-WITHFP-NEXT:    addi sp, sp, -64
 ; RV32I-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32I-WITHFP-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 64
-; RV32I-WITHFP-NEXT:    li a0, 18
-; RV32I-WITHFP-NEXT:    li a1, 17
-; RV32I-WITHFP-NEXT:    sw a1, 20(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 24(sp)
-; RV32I-WITHFP-NEXT:    li a0, 16
-; RV32I-WITHFP-NEXT:    lui a1, 262236
-; RV32I-WITHFP-NEXT:    addi a1, a1, 655
-; RV32I-WITHFP-NEXT:    lui a2, 377487
-; RV32I-WITHFP-NEXT:    addi a2, a2, 1475
-; RV32I-WITHFP-NEXT:    li a3, 15
-; RV32I-WITHFP-NEXT:    sw a3, 0(sp)
-; RV32I-WITHFP-NEXT:    sw a2, 8(sp)
-; RV32I-WITHFP-NEXT:    sw a1, 12(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 16(sp)
-; RV32I-WITHFP-NEXT:    lui a0, 262153
-; RV32I-WITHFP-NEXT:    addi t0, a0, 491
-; RV32I-WITHFP-NEXT:    lui a0, 545260
-; RV32I-WITHFP-NEXT:    addi t1, a0, -1967
-; RV32I-WITHFP-NEXT:    lui a0, 964690
-; RV32I-WITHFP-NEXT:    addi t2, a0, -328
-; RV32I-WITHFP-NEXT:    lui a0, 335544
-; RV32I-WITHFP-NEXT:    addi t3, a0, 1311
-; RV32I-WITHFP-NEXT:    lui a0, 688509
-; RV32I-WITHFP-NEXT:    addi a5, a0, -2048
+; RV32I-WITHFP-NEXT:    li a5, 18
+; RV32I-WITHFP-NEXT:    li a6, 17
+; RV32I-WITHFP-NEXT:    li a7, 16
+; RV32I-WITHFP-NEXT:    lui t0, 262236
+; RV32I-WITHFP-NEXT:    lui t1, 377487
+; RV32I-WITHFP-NEXT:    li t2, 15
+; RV32I-WITHFP-NEXT:    lui t3, 262153
+; RV32I-WITHFP-NEXT:    lui t4, 545260
+; RV32I-WITHFP-NEXT:    lui t5, 964690
+; RV32I-WITHFP-NEXT:    lui t6, 335544
+; RV32I-WITHFP-NEXT:    lui s1, 688509
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a1, 11
 ; RV32I-WITHFP-NEXT:    addi a2, s0, -32
 ; RV32I-WITHFP-NEXT:    li a3, 12
 ; RV32I-WITHFP-NEXT:    li a4, 13
+; RV32I-WITHFP-NEXT:    sw a6, 20(sp)
+; RV32I-WITHFP-NEXT:    sw a5, 24(sp)
 ; RV32I-WITHFP-NEXT:    li a6, 4
+; RV32I-WITHFP-NEXT:    addi a5, t0, 655
+; RV32I-WITHFP-NEXT:    addi t0, t1, 1475
+; RV32I-WITHFP-NEXT:    sw t2, 0(sp)
+; RV32I-WITHFP-NEXT:    sw t0, 8(sp)
+; RV32I-WITHFP-NEXT:    sw a5, 12(sp)
+; RV32I-WITHFP-NEXT:    sw a7, 16(sp)
 ; RV32I-WITHFP-NEXT:    li a7, 14
+; RV32I-WITHFP-NEXT:    addi t0, t3, 491
+; RV32I-WITHFP-NEXT:    addi t1, t4, -1967
+; RV32I-WITHFP-NEXT:    addi t2, t5, -328
+; RV32I-WITHFP-NEXT:    addi t3, t6, 1311
+; RV32I-WITHFP-NEXT:    addi a5, s1, -2048
 ; RV32I-WITHFP-NEXT:    sw t3, -32(s0)
 ; RV32I-WITHFP-NEXT:    sw t2, -28(s0)
 ; RV32I-WITHFP-NEXT:    sw t1, -24(s0)
@@ -229,6 +232,7 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-WITHFP-NEXT:    call callee_aligned_stack
 ; RV32I-WITHFP-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
+; RV32I-WITHFP-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 64
 ; RV32I-WITHFP-NEXT:    ret
   %1 = call i32 @callee_aligned_stack(i32 1, i32 11,

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
index 9e4c8a6e3320c2..18916dd69eb43a 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll
@@ -86,15 +86,15 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; RV32I-FPELIM-NEXT:    lw t1, 0(sp)
 ; RV32I-FPELIM-NEXT:    andi a0, a0, 255
 ; RV32I-FPELIM-NEXT:    slli a1, a1, 16
+; RV32I-FPELIM-NEXT:    xor a3, a3, a7
 ; RV32I-FPELIM-NEXT:    srli a1, a1, 16
 ; RV32I-FPELIM-NEXT:    add a0, a0, a2
 ; RV32I-FPELIM-NEXT:    add a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a1, a4, t1
-; RV32I-FPELIM-NEXT:    xor a2, a3, a7
-; RV32I-FPELIM-NEXT:    or a1, a2, a1
-; RV32I-FPELIM-NEXT:    seqz a1, a1
 ; RV32I-FPELIM-NEXT:    add a0, a0, a5
+; RV32I-FPELIM-NEXT:    xor a1, a4, t1
 ; RV32I-FPELIM-NEXT:    add a0, a0, a6
+; RV32I-FPELIM-NEXT:    or a1, a3, a1
+; RV32I-FPELIM-NEXT:    seqz a1, a1
 ; RV32I-FPELIM-NEXT:    add a0, a0, t0
 ; RV32I-FPELIM-NEXT:    add a0, a1, a0
 ; RV32I-FPELIM-NEXT:    ret
@@ -109,15 +109,15 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; RV32I-WITHFP-NEXT:    lw t1, 0(s0)
 ; RV32I-WITHFP-NEXT:    andi a0, a0, 255
 ; RV32I-WITHFP-NEXT:    slli a1, a1, 16
+; RV32I-WITHFP-NEXT:    xor a3, a3, a7
 ; RV32I-WITHFP-NEXT:    srli a1, a1, 16
 ; RV32I-WITHFP-NEXT:    add a0, a0, a2
 ; RV32I-WITHFP-NEXT:    add a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a1, a4, t1
-; RV32I-WITHFP-NEXT:    xor a2, a3, a7
-; RV32I-WITHFP-NEXT:    or a1, a2, a1
-; RV32I-WITHFP-NEXT:    seqz a1, a1
 ; RV32I-WITHFP-NEXT:    add a0, a0, a5
+; RV32I-WITHFP-NEXT:    xor a1, a4, t1
 ; RV32I-WITHFP-NEXT:    add a0, a0, a6
+; RV32I-WITHFP-NEXT:    or a1, a3, a1
+; RV32I-WITHFP-NEXT:    seqz a1, a1
 ; RV32I-WITHFP-NEXT:    add a0, a0, t0
 ; RV32I-WITHFP-NEXT:    add a0, a1, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -190,21 +190,21 @@ define i32 @caller_many_scalars() nounwind {
 define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-FPELIM-LABEL: callee_large_scalars:
 ; RV32I-FPELIM:       # %bb.0:
-; RV32I-FPELIM-NEXT:    lw a2, 0(a0)
-; RV32I-FPELIM-NEXT:    lw a3, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a4, 12(a1)
+; RV32I-FPELIM-NEXT:    lw a2, 0(a1)
+; RV32I-FPELIM-NEXT:    lw a3, 4(a1)
+; RV32I-FPELIM-NEXT:    lw a4, 8(a1)
+; RV32I-FPELIM-NEXT:    lw a1, 12(a1)
 ; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
-; RV32I-FPELIM-NEXT:    lw a6, 0(a1)
-; RV32I-FPELIM-NEXT:    lw a7, 4(a1)
-; RV32I-FPELIM-NEXT:    lw a1, 8(a1)
-; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    xor a4, a5, a4
-; RV32I-FPELIM-NEXT:    xor a3, a3, a7
-; RV32I-FPELIM-NEXT:    or a3, a3, a4
-; RV32I-FPELIM-NEXT:    xor a0, a0, a1
-; RV32I-FPELIM-NEXT:    xor a1, a2, a6
-; RV32I-FPELIM-NEXT:    or a0, a1, a0
-; RV32I-FPELIM-NEXT:    or a0, a0, a3
+; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
+; RV32I-FPELIM-NEXT:    lw a7, 8(a0)
+; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32I-FPELIM-NEXT:    xor a1, a5, a1
+; RV32I-FPELIM-NEXT:    xor a3, a6, a3
+; RV32I-FPELIM-NEXT:    xor a4, a7, a4
+; RV32I-FPELIM-NEXT:    xor a0, a0, a2
+; RV32I-FPELIM-NEXT:    or a1, a3, a1
+; RV32I-FPELIM-NEXT:    or a0, a0, a4
+; RV32I-FPELIM-NEXT:    or a0, a0, a1
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -214,21 +214,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    lw a2, 0(a0)
-; RV32I-WITHFP-NEXT:    lw a3, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a4, 12(a1)
+; RV32I-WITHFP-NEXT:    lw a2, 0(a1)
+; RV32I-WITHFP-NEXT:    lw a3, 4(a1)
+; RV32I-WITHFP-NEXT:    lw a4, 8(a1)
+; RV32I-WITHFP-NEXT:    lw a1, 12(a1)
 ; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
-; RV32I-WITHFP-NEXT:    lw a6, 0(a1)
-; RV32I-WITHFP-NEXT:    lw a7, 4(a1)
-; RV32I-WITHFP-NEXT:    lw a1, 8(a1)
-; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    xor a4, a5, a4
-; RV32I-WITHFP-NEXT:    xor a3, a3, a7
-; RV32I-WITHFP-NEXT:    or a3, a3, a4
-; RV32I-WITHFP-NEXT:    xor a0, a0, a1
-; RV32I-WITHFP-NEXT:    xor a1, a2, a6
-; RV32I-WITHFP-NEXT:    or a0, a1, a0
-; RV32I-WITHFP-NEXT:    or a0, a0, a3
+; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
+; RV32I-WITHFP-NEXT:    lw a7, 8(a0)
+; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
+; RV32I-WITHFP-NEXT:    xor a1, a5, a1
+; RV32I-WITHFP-NEXT:    xor a3, a6, a3
+; RV32I-WITHFP-NEXT:    xor a4, a7, a4
+; RV32I-WITHFP-NEXT:    xor a0, a0, a2
+; RV32I-WITHFP-NEXT:    or a1, a3, a1
+; RV32I-WITHFP-NEXT:    or a0, a0, a4
+; RV32I-WITHFP-NEXT:    or a0, a0, a1
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -245,13 +245,13 @@ define i32 @caller_large_scalars() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
 ; RV32I-FPELIM-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    lui a0, 524272
+; RV32I-FPELIM-NEXT:    lui a1, 524272
+; RV32I-FPELIM-NEXT:    li a2, 1
+; RV32I-FPELIM-NEXT:    addi a0, sp, 24
 ; RV32I-FPELIM-NEXT:    sw zero, 0(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 4(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 8(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32I-FPELIM-NEXT:    li a2, 1
-; RV32I-FPELIM-NEXT:    addi a0, sp, 24
+; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
 ; RV32I-FPELIM-NEXT:    mv a1, sp
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 28(sp)
@@ -268,13 +268,13 @@ define i32 @caller_large_scalars() nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 48
-; RV32I-WITHFP-NEXT:    lui a0, 524272
+; RV32I-WITHFP-NEXT:    lui a1, 524272
+; RV32I-WITHFP-NEXT:    li a2, 1
+; RV32I-WITHFP-NEXT:    addi a0, s0, -24
 ; RV32I-WITHFP-NEXT:    sw zero, -48(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -44(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -40(s0)
-; RV32I-WITHFP-NEXT:    sw a0, -36(s0)
-; RV32I-WITHFP-NEXT:    li a2, 1
-; RV32I-WITHFP-NEXT:    addi a0, s0, -24
+; RV32I-WITHFP-NEXT:    sw a1, -36(s0)
 ; RV32I-WITHFP-NEXT:    addi a1, s0, -48
 ; RV32I-WITHFP-NEXT:    sw a2, -24(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -20(s0)
@@ -299,18 +299,18 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-FPELIM-NEXT:    lw a0, 4(sp)
 ; RV32I-FPELIM-NEXT:    lw a1, 0(a7)
 ; RV32I-FPELIM-NEXT:    lw a2, 4(a7)
-; RV32I-FPELIM-NEXT:    lw a3, 12(a0)
+; RV32I-FPELIM-NEXT:    lw a3, 8(a7)
 ; RV32I-FPELIM-NEXT:    lw a4, 12(a7)
-; RV32I-FPELIM-NEXT:    lw a5, 0(a0)
+; RV32I-FPELIM-NEXT:    lw a5, 12(a0)
 ; RV32I-FPELIM-NEXT:    lw a6, 4(a0)
-; RV32I-FPELIM-NEXT:    lw a0, 8(a0)
-; RV32I-FPELIM-NEXT:    lw a7, 8(a7)
-; RV32I-FPELIM-NEXT:    xor a3, a4, a3
+; RV32I-FPELIM-NEXT:    lw a7, 8(a0)
+; RV32I-FPELIM-NEXT:    lw a0, 0(a0)
+; RV32I-FPELIM-NEXT:    xor a4, a4, a5
 ; RV32I-FPELIM-NEXT:    xor a2, a2, a6
-; RV32I-FPELIM-NEXT:    or a2, a2, a3
-; RV32I-FPELIM-NEXT:    xor a0, a7, a0
-; RV32I-FPELIM-NEXT:    xor a1, a1, a5
-; RV32I-FPELIM-NEXT:    or a0, a1, a0
+; RV32I-FPELIM-NEXT:    xor a3, a3, a7
+; RV32I-FPELIM-NEXT:    xor a0, a1, a0
+; RV32I-FPELIM-NEXT:    or a2, a2, a4
+; RV32I-FPELIM-NEXT:    or a0, a0, a3
 ; RV32I-FPELIM-NEXT:    or a0, a0, a2
 ; RV32I-FPELIM-NEXT:    seqz a0, a0
 ; RV32I-FPELIM-NEXT:    ret
@@ -324,18 +324,18 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
 ; RV32I-WITHFP-NEXT:    lw a1, 0(a7)
 ; RV32I-WITHFP-NEXT:    lw a2, 4(a7)
-; RV32I-WITHFP-NEXT:    lw a3, 12(a0)
+; RV32I-WITHFP-NEXT:    lw a3, 8(a7)
 ; RV32I-WITHFP-NEXT:    lw a4, 12(a7)
-; RV32I-WITHFP-NEXT:    lw a5, 0(a0)
+; RV32I-WITHFP-NEXT:    lw a5, 12(a0)
 ; RV32I-WITHFP-NEXT:    lw a6, 4(a0)
-; RV32I-WITHFP-NEXT:    lw a0, 8(a0)
-; RV32I-WITHFP-NEXT:    lw a7, 8(a7)
-; RV32I-WITHFP-NEXT:    xor a3, a4, a3
+; RV32I-WITHFP-NEXT:    lw a7, 8(a0)
+; RV32I-WITHFP-NEXT:    lw a0, 0(a0)
+; RV32I-WITHFP-NEXT:    xor a4, a4, a5
 ; RV32I-WITHFP-NEXT:    xor a2, a2, a6
-; RV32I-WITHFP-NEXT:    or a2, a2, a3
-; RV32I-WITHFP-NEXT:    xor a0, a7, a0
-; RV32I-WITHFP-NEXT:    xor a1, a1, a5
-; RV32I-WITHFP-NEXT:    or a0, a1, a0
+; RV32I-WITHFP-NEXT:    xor a3, a3, a7
+; RV32I-WITHFP-NEXT:    xor a0, a1, a0
+; RV32I-WITHFP-NEXT:    or a2, a2, a4
+; RV32I-WITHFP-NEXT:    or a0, a0, a3
 ; RV32I-WITHFP-NEXT:    or a0, a0, a2
 ; RV32I-WITHFP-NEXT:    seqz a0, a0
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -353,25 +353,25 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32I-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    addi a0, sp, 16
-; RV32I-FPELIM-NEXT:    li a1, 9
-; RV32I-FPELIM-NEXT:    sw a1, 0(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 4(sp)
-; RV32I-FPELIM-NEXT:    lui a0, 524272
-; RV32I-FPELIM-NEXT:    sw zero, 16(sp)
-; RV32I-FPELIM-NEXT:    sw zero, 20(sp)
-; RV32I-FPELIM-NEXT:    sw zero, 24(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 28(sp)
-; RV32I-FPELIM-NEXT:    li t0, 8
+; RV32I-FPELIM-NEXT:    addi a6, sp, 16
+; RV32I-FPELIM-NEXT:    li a7, 9
+; RV32I-FPELIM-NEXT:    lui t0, 524272
+; RV32I-FPELIM-NEXT:    li t1, 8
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a1, 2
 ; RV32I-FPELIM-NEXT:    li a2, 3
 ; RV32I-FPELIM-NEXT:    li a3, 4
 ; RV32I-FPELIM-NEXT:    li a4, 5
 ; RV32I-FPELIM-NEXT:    li a5, 6
+; RV32I-FPELIM-NEXT:    sw a7, 0(sp)
+; RV32I-FPELIM-NEXT:    sw a6, 4(sp)
 ; RV32I-FPELIM-NEXT:    li a6, 7
+; RV32I-FPELIM-NEXT:    sw zero, 16(sp)
+; RV32I-FPELIM-NEXT:    sw zero, 20(sp)
+; RV32I-FPELIM-NEXT:    sw zero, 24(sp)
+; RV32I-FPELIM-NEXT:    sw t0, 28(sp)
 ; RV32I-FPELIM-NEXT:    addi a7, sp, 40
-; RV32I-FPELIM-NEXT:    sw t0, 40(sp)
+; RV32I-FPELIM-NEXT:    sw t1, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 48(sp)
 ; RV32I-FPELIM-NEXT:    sw zero, 52(sp)
@@ -386,25 +386,25 @@ define i32 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 64
-; RV32I-WITHFP-NEXT:    addi a0, s0, -48
-; RV32I-WITHFP-NEXT:    li a1, 9
-; RV32I-WITHFP-NEXT:    sw a1, 0(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 4(sp)
-; RV32I-WITHFP-NEXT:    lui a0, 524272
-; RV32I-WITHFP-NEXT:    sw zero, -48(s0)
-; RV32I-WITHFP-NEXT:    sw zero, -44(s0)
-; RV32I-WITHFP-NEXT:    sw zero, -40(s0)
-; RV32I-WITHFP-NEXT:    sw a0, -36(s0)
-; RV32I-WITHFP-NEXT:    li t0, 8
+; RV32I-WITHFP-NEXT:    addi a6, s0, -48
+; RV32I-WITHFP-NEXT:    li a7, 9
+; RV32I-WITHFP-NEXT:    lui t0, 524272
+; RV32I-WITHFP-NEXT:    li t1, 8
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a1, 2
 ; RV32I-WITHFP-NEXT:    li a2, 3
 ; RV32I-WITHFP-NEXT:    li a3, 4
 ; RV32I-WITHFP-NEXT:    li a4, 5
 ; RV32I-WITHFP-NEXT:    li a5, 6
+; RV32I-WITHFP-NEXT:    sw a7, 0(sp)
+; RV32I-WITHFP-NEXT:    sw a6, 4(sp)
 ; RV32I-WITHFP-NEXT:    li a6, 7
+; RV32I-WITHFP-NEXT:    sw zero, -48(s0)
+; RV32I-WITHFP-NEXT:    sw zero, -44(s0)
+; RV32I-WITHFP-NEXT:    sw zero, -40(s0)
+; RV32I-WITHFP-NEXT:    sw t0, -36(s0)
 ; RV32I-WITHFP-NEXT:    addi a7, s0, -24
-; RV32I-WITHFP-NEXT:    sw t0, -24(s0)
+; RV32I-WITHFP-NEXT:    sw t1, -24(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -20(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -16(s0)
 ; RV32I-WITHFP-NEXT:    sw zero, -12(s0)
@@ -664,34 +664,34 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32I-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32I-FPELIM-NEXT:    li a0, 19
-; RV32I-FPELIM-NEXT:    li a1, 18
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 24(sp)
-; RV32I-FPELIM-NEXT:    li a0, 17
-; RV32I-FPELIM-NEXT:    li a1, 16
-; RV32I-FPELIM-NEXT:    li a2, 15
-; RV32I-FPELIM-NEXT:    sw a2, 0(sp)
-; RV32I-FPELIM-NEXT:    sw a1, 8(sp)
-; RV32I-FPELIM-NEXT:    sw zero, 12(sp)
-; RV32I-FPELIM-NEXT:    sw a0, 16(sp)
-; RV32I-FPELIM-NEXT:    lui a0, 262153
-; RV32I-FPELIM-NEXT:    addi t0, a0, 491
-; RV32I-FPELIM-NEXT:    lui a0, 545260
-; RV32I-FPELIM-NEXT:    addi t1, a0, -1967
-; RV32I-FPELIM-NEXT:    lui a0, 964690
-; RV32I-FPELIM-NEXT:    addi t2, a0, -328
-; RV32I-FPELIM-NEXT:    lui a0, 335544
-; RV32I-FPELIM-NEXT:    addi t3, a0, 1311
-; RV32I-FPELIM-NEXT:    lui a0, 688509
-; RV32I-FPELIM-NEXT:    addi a5, a0, -2048
+; RV32I-FPELIM-NEXT:    li a5, 19
+; RV32I-FPELIM-NEXT:    li a6, 18
+; RV32I-FPELIM-NEXT:    li a7, 17
+; RV32I-FPELIM-NEXT:    li t0, 16
+; RV32I-FPELIM-NEXT:    li t1, 15
+; RV32I-FPELIM-NEXT:    lui t2, 262153
+; RV32I-FPELIM-NEXT:    lui t3, 545260
+; RV32I-FPELIM-NEXT:    lui t4, 964690
+; RV32I-FPELIM-NEXT:    lui t5, 335544
+; RV32I-FPELIM-NEXT:    lui t6, 688509
 ; RV32I-FPELIM-NEXT:    li a0, 1
 ; RV32I-FPELIM-NEXT:    li a1, 11
 ; RV32I-FPELIM-NEXT:    addi a2, sp, 32
 ; RV32I-FPELIM-NEXT:    li a3, 12
 ; RV32I-FPELIM-NEXT:    li a4, 13
+; RV32I-FPELIM-NEXT:    sw a6, 20(sp)
+; RV32I-FPELIM-NEXT:    sw a5, 24(sp)
 ; RV32I-FPELIM-NEXT:    li a6, 4
+; RV32I-FPELIM-NEXT:    sw t1, 0(sp)
+; RV32I-FPELIM-NEXT:    sw t0, 8(sp)
+; RV32I-FPELIM-NEXT:    sw zero, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a7, 16(sp)
 ; RV32I-FPELIM-NEXT:    li a7, 14
+; RV32I-FPELIM-NEXT:    addi t0, t2, 491
+; RV32I-FPELIM-NEXT:    addi t1, t3, -1967
+; RV32I-FPELIM-NEXT:    addi t2, t4, -328
+; RV32I-FPELIM-NEXT:    addi t3, t5, 1311
+; RV32I-FPELIM-NEXT:    addi a5, t6, -2048
 ; RV32I-FPELIM-NEXT:    sw t3, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw t2, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw t1, 40(sp)
@@ -707,34 +707,34 @@ define void @caller_aligned_stack() nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 64
-; RV32I-WITHFP-NEXT:    li a0, 19
-; RV32I-WITHFP-NEXT:    li a1, 18
-; RV32I-WITHFP-NEXT:    sw a1, 20(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 24(sp)
-; RV32I-WITHFP-NEXT:    li a0, 17
-; RV32I-WITHFP-NEXT:    li a1, 16
-; RV32I-WITHFP-NEXT:    li a2, 15
-; RV32I-WITHFP-NEXT:    sw a2, 0(sp)
-; RV32I-WITHFP-NEXT:    sw a1, 8(sp)
-; RV32I-WITHFP-NEXT:    sw zero, 12(sp)
-; RV32I-WITHFP-NEXT:    sw a0, 16(sp)
-; RV32I-WITHFP-NEXT:    lui a0, 262153
-; RV32I-WITHFP-NEXT:    addi t0, a0, 491
-; RV32I-WITHFP-NEXT:    lui a0, 545260
-; RV32I-WITHFP-NEXT:    addi t1, a0, -1967
-; RV32I-WITHFP-NEXT:    lui a0, 964690
-; RV32I-WITHFP-NEXT:    addi t2, a0, -328
-; RV32I-WITHFP-NEXT:    lui a0, 335544
-; RV32I-WITHFP-NEXT:    addi t3, a0, 1311
-; RV32I-WITHFP-NEXT:    lui a0, 688509
-; RV32I-WITHFP-NEXT:    addi a5, a0, -2048
+; RV32I-WITHFP-NEXT:    li a5, 19
+; RV32I-WITHFP-NEXT:    li a6, 18
+; RV32I-WITHFP-NEXT:    li a7, 17
+; RV32I-WITHFP-NEXT:    li t0, 16
+; RV32I-WITHFP-NEXT:    li t1, 15
+; RV32I-WITHFP-NEXT:    lui t2, 262153
+; RV32I-WITHFP-NEXT:    lui t3, 545260
+; RV32I-WITHFP-NEXT:    lui t4, 964690
+; RV32I-WITHFP-NEXT:    lui t5, 335544
+; RV32I-WITHFP-NEXT:    lui t6, 688509
 ; RV32I-WITHFP-NEXT:    li a0, 1
 ; RV32I-WITHFP-NEXT:    li a1, 11
 ; RV32I-WITHFP-NEXT:    addi a2, s0, -32
 ; RV32I-WITHFP-NEXT:    li a3, 12
 ; RV32I-WITHFP-NEXT:    li a4, 13
+; RV32I-WITHFP-NEXT:    sw a6, 20(sp)
+; RV32I-WITHFP-NEXT:    sw a5, 24(sp)
 ; RV32I-WITHFP-NEXT:    li a6, 4
+; RV32I-WITHFP-NEXT:    sw t1, 0(sp)
+; RV32I-WITHFP-NEXT:    sw t0, 8(sp)
+; RV32I-WITHFP-NEXT:    sw zero, 12(sp)
+; RV32I-WITHFP-NEXT:    sw a7, 16(sp)
 ; RV32I-WITHFP-NEXT:    li a7, 14
+; RV32I-WITHFP-NEXT:    addi t0, t2, 491
+; RV32I-WITHFP-NEXT:    addi t1, t3, -1967
+; RV32I-WITHFP-NEXT:    addi t2, t4, -328
+; RV32I-WITHFP-NEXT:    addi t3, t5, 1311
+; RV32I-WITHFP-NEXT:    addi a5, t6, -2048
 ; RV32I-WITHFP-NEXT:    sw t3, -32(s0)
 ; RV32I-WITHFP-NEXT:    sw t2, -28(s0)
 ; RV32I-WITHFP-NEXT:    sw t1, -24(s0)

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
index 1321413fbc57e8..7630d5b8f77ef4 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32d.ll
@@ -97,21 +97,21 @@ define i32 @caller_double_in_gpr_exhausted_fprs() nounwind {
 ; RV32-ILP32D-NEXT:    addi sp, sp, -16
 ; RV32-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_0)
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI5_1)
 ; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI5_0)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_1)
-; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI5_1)(a0)
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_2)
+; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI5_1)(a1)
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI5_3)
 ; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI5_2)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_3)
-; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI5_3)(a0)
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_4)
+; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI5_3)(a1)
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI5_5)
 ; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI5_4)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_5)
-; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI5_5)(a0)
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_6)
+; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI5_5)(a1)
+; RV32-ILP32D-NEXT:    lui a1, %hi(.LCPI5_7)
 ; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI5_6)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI5_7)
-; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI5_7)(a0)
+; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI5_7)(a1)
 ; RV32-ILP32D-NEXT:    lui a1, 262688
 ; RV32-ILP32D-NEXT:    li a0, 0
 ; RV32-ILP32D-NEXT:    call callee_double_in_gpr_exhausted_fprs
@@ -149,20 +149,20 @@ define i32 @caller_double_in_gpr_and_stack_almost_exhausted_gprs_fprs() nounwind
 ; RV32-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32D-NEXT:    lui a1, 262816
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_0)
+; RV32-ILP32D-NEXT:    lui a2, %hi(.LCPI7_1)
+; RV32-ILP32D-NEXT:    lui a3, %hi(.LCPI7_2)
+; RV32-ILP32D-NEXT:    lui a4, %hi(.LCPI7_3)
+; RV32-ILP32D-NEXT:    lui a5, %hi(.LCPI7_4)
+; RV32-ILP32D-NEXT:    lui a6, %hi(.LCPI7_5)
+; RV32-ILP32D-NEXT:    lui a7, %hi(.LCPI7_6)
 ; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI7_0)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_1)
-; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI7_1)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_2)
-; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI7_2)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_3)
-; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI7_3)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_4)
-; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI7_4)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_5)
-; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI7_5)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_6)
-; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI7_6)(a0)
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI7_7)
+; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI7_1)(a2)
+; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI7_2)(a3)
+; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI7_3)(a4)
+; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI7_4)(a5)
+; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI7_5)(a6)
+; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI7_6)(a7)
 ; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI7_7)(a0)
 ; RV32-ILP32D-NEXT:    li a0, 1
 ; RV32-ILP32D-NEXT:    li a2, 3
@@ -205,22 +205,22 @@ define i32 @caller_double_on_stack_exhausted_gprs_fprs() nounwind {
 ; RV32-ILP32D-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32D-NEXT:    lui a1, 262816
 ; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_0)
+; RV32-ILP32D-NEXT:    lui a2, %hi(.LCPI9_1)
+; RV32-ILP32D-NEXT:    lui a3, %hi(.LCPI9_2)
+; RV32-ILP32D-NEXT:    lui a4, %hi(.LCPI9_3)
+; RV32-ILP32D-NEXT:    lui a5, %hi(.LCPI9_4)
+; RV32-ILP32D-NEXT:    lui a6, %hi(.LCPI9_5)
+; RV32-ILP32D-NEXT:    lui a7, %hi(.LCPI9_6)
 ; RV32-ILP32D-NEXT:    fld fa0, %lo(.LCPI9_0)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_1)
-; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI9_1)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_2)
-; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI9_2)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_3)
-; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI9_3)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_4)
-; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI9_4)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_5)
-; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI9_5)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_6)
-; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI9_6)(a0)
-; RV32-ILP32D-NEXT:    lui a0, %hi(.LCPI9_7)
-; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI9_7)(a0)
+; RV32-ILP32D-NEXT:    lui t0, %hi(.LCPI9_7)
+; RV32-ILP32D-NEXT:    fld fa1, %lo(.LCPI9_1)(a2)
 ; RV32-ILP32D-NEXT:    li a0, 1
+; RV32-ILP32D-NEXT:    fld fa2, %lo(.LCPI9_2)(a3)
+; RV32-ILP32D-NEXT:    fld fa3, %lo(.LCPI9_3)(a4)
+; RV32-ILP32D-NEXT:    fld fa4, %lo(.LCPI9_4)(a5)
+; RV32-ILP32D-NEXT:    fld fa5, %lo(.LCPI9_5)(a6)
+; RV32-ILP32D-NEXT:    fld fa6, %lo(.LCPI9_6)(a7)
+; RV32-ILP32D-NEXT:    fld fa7, %lo(.LCPI9_7)(t0)
 ; RV32-ILP32D-NEXT:    li a2, 3
 ; RV32-ILP32D-NEXT:    li a4, 5
 ; RV32-ILP32D-NEXT:    li a6, 7

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
index 2b779cd34a8072..e16bed5400300b 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32e.ll
@@ -694,39 +694,39 @@ define void @caller_aligned_stack() {
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    li a0, 18
-; ILP32E-FPELIM-NEXT:    li a1, 17
-; ILP32E-FPELIM-NEXT:    li a2, 16
-; ILP32E-FPELIM-NEXT:    lui a3, 262236
-; ILP32E-FPELIM-NEXT:    addi a3, a3, 655
-; ILP32E-FPELIM-NEXT:    sw a3, 16(sp)
-; ILP32E-FPELIM-NEXT:    sw a2, 20(sp)
-; ILP32E-FPELIM-NEXT:    sw a1, 24(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 28(sp)
-; ILP32E-FPELIM-NEXT:    lui a0, 377487
-; ILP32E-FPELIM-NEXT:    addi a0, a0, 1475
-; ILP32E-FPELIM-NEXT:    li a1, 15
-; ILP32E-FPELIM-NEXT:    li a2, 14
-; ILP32E-FPELIM-NEXT:    li a3, 4
-; ILP32E-FPELIM-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-NEXT:    sw a1, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-NEXT:    lui a0, 262153
-; ILP32E-FPELIM-NEXT:    addi a6, a0, 491
-; ILP32E-FPELIM-NEXT:    lui a0, 545260
-; ILP32E-FPELIM-NEXT:    addi a7, a0, -1967
-; ILP32E-FPELIM-NEXT:    lui a0, 964690
-; ILP32E-FPELIM-NEXT:    addi t0, a0, -328
-; ILP32E-FPELIM-NEXT:    lui a0, 335544
-; ILP32E-FPELIM-NEXT:    addi t1, a0, 1311
-; ILP32E-FPELIM-NEXT:    lui a0, 688509
-; ILP32E-FPELIM-NEXT:    addi a5, a0, -2048
+; ILP32E-FPELIM-NEXT:    li a3, 18
+; ILP32E-FPELIM-NEXT:    li a4, 17
+; ILP32E-FPELIM-NEXT:    li a5, 16
+; ILP32E-FPELIM-NEXT:    lui a6, 262236
+; ILP32E-FPELIM-NEXT:    lui a7, 377487
+; ILP32E-FPELIM-NEXT:    li t0, 15
+; ILP32E-FPELIM-NEXT:    li t1, 14
+; ILP32E-FPELIM-NEXT:    li t2, 4
+; ILP32E-FPELIM-NEXT:    lui t3, 262153
+; ILP32E-FPELIM-NEXT:    lui t4, 545260
+; ILP32E-FPELIM-NEXT:    lui t5, 964690
+; ILP32E-FPELIM-NEXT:    lui t6, 335544
+; ILP32E-FPELIM-NEXT:    lui s2, 688509
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a1, 11
 ; ILP32E-FPELIM-NEXT:    addi a2, sp, 32
+; ILP32E-FPELIM-NEXT:    addi a6, a6, 655
+; ILP32E-FPELIM-NEXT:    sw a6, 16(sp)
+; ILP32E-FPELIM-NEXT:    sw a5, 20(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 24(sp)
+; ILP32E-FPELIM-NEXT:    sw a3, 28(sp)
 ; ILP32E-FPELIM-NEXT:    li a3, 12
+; ILP32E-FPELIM-NEXT:    addi a4, a7, 1475
+; ILP32E-FPELIM-NEXT:    sw t2, 0(sp)
+; ILP32E-FPELIM-NEXT:    sw t1, 4(sp)
+; ILP32E-FPELIM-NEXT:    sw t0, 8(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-NEXT:    li a4, 13
+; ILP32E-FPELIM-NEXT:    addi a6, t3, 491
+; ILP32E-FPELIM-NEXT:    addi a7, t4, -1967
+; ILP32E-FPELIM-NEXT:    addi t0, t5, -328
+; ILP32E-FPELIM-NEXT:    addi t1, t6, 1311
+; ILP32E-FPELIM-NEXT:    addi a5, s2, -2048
 ; ILP32E-FPELIM-NEXT:    sw t1, 32(sp)
 ; ILP32E-FPELIM-NEXT:    sw t0, 36(sp)
 ; ILP32E-FPELIM-NEXT:    sw a7, 40(sp)
@@ -753,39 +753,39 @@ define void @caller_aligned_stack() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    li a0, 18
-; ILP32E-WITHFP-NEXT:    li a1, 17
-; ILP32E-WITHFP-NEXT:    li a2, 16
-; ILP32E-WITHFP-NEXT:    lui a3, 262236
-; ILP32E-WITHFP-NEXT:    addi a3, a3, 655
-; ILP32E-WITHFP-NEXT:    sw a3, 16(sp)
-; ILP32E-WITHFP-NEXT:    sw a2, 20(sp)
-; ILP32E-WITHFP-NEXT:    sw a1, 24(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 28(sp)
-; ILP32E-WITHFP-NEXT:    lui a0, 377487
-; ILP32E-WITHFP-NEXT:    addi a0, a0, 1475
-; ILP32E-WITHFP-NEXT:    li a1, 15
-; ILP32E-WITHFP-NEXT:    li a2, 14
-; ILP32E-WITHFP-NEXT:    li a3, 4
-; ILP32E-WITHFP-NEXT:    sw a3, 0(sp)
-; ILP32E-WITHFP-NEXT:    sw a2, 4(sp)
-; ILP32E-WITHFP-NEXT:    sw a1, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-NEXT:    lui a0, 262153
-; ILP32E-WITHFP-NEXT:    addi a6, a0, 491
-; ILP32E-WITHFP-NEXT:    lui a0, 545260
-; ILP32E-WITHFP-NEXT:    addi a7, a0, -1967
-; ILP32E-WITHFP-NEXT:    lui a0, 964690
-; ILP32E-WITHFP-NEXT:    addi t0, a0, -328
-; ILP32E-WITHFP-NEXT:    lui a0, 335544
-; ILP32E-WITHFP-NEXT:    addi t1, a0, 1311
-; ILP32E-WITHFP-NEXT:    lui a0, 688509
-; ILP32E-WITHFP-NEXT:    addi a5, a0, -2048
+; ILP32E-WITHFP-NEXT:    li a3, 18
+; ILP32E-WITHFP-NEXT:    li a4, 17
+; ILP32E-WITHFP-NEXT:    li a5, 16
+; ILP32E-WITHFP-NEXT:    lui a6, 262236
+; ILP32E-WITHFP-NEXT:    lui a7, 377487
+; ILP32E-WITHFP-NEXT:    li t0, 15
+; ILP32E-WITHFP-NEXT:    li t1, 14
+; ILP32E-WITHFP-NEXT:    li t2, 4
+; ILP32E-WITHFP-NEXT:    lui t3, 262153
+; ILP32E-WITHFP-NEXT:    lui t4, 545260
+; ILP32E-WITHFP-NEXT:    lui t5, 964690
+; ILP32E-WITHFP-NEXT:    lui t6, 335544
+; ILP32E-WITHFP-NEXT:    lui s2, 688509
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a1, 11
 ; ILP32E-WITHFP-NEXT:    addi a2, sp, 32
+; ILP32E-WITHFP-NEXT:    addi a6, a6, 655
+; ILP32E-WITHFP-NEXT:    sw a6, 16(sp)
+; ILP32E-WITHFP-NEXT:    sw a5, 20(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 24(sp)
+; ILP32E-WITHFP-NEXT:    sw a3, 28(sp)
 ; ILP32E-WITHFP-NEXT:    li a3, 12
+; ILP32E-WITHFP-NEXT:    addi a4, a7, 1475
+; ILP32E-WITHFP-NEXT:    sw t2, 0(sp)
+; ILP32E-WITHFP-NEXT:    sw t1, 4(sp)
+; ILP32E-WITHFP-NEXT:    sw t0, 8(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 12(sp)
 ; ILP32E-WITHFP-NEXT:    li a4, 13
+; ILP32E-WITHFP-NEXT:    addi a6, t3, 491
+; ILP32E-WITHFP-NEXT:    addi a7, t4, -1967
+; ILP32E-WITHFP-NEXT:    addi t0, t5, -328
+; ILP32E-WITHFP-NEXT:    addi t1, t6, 1311
+; ILP32E-WITHFP-NEXT:    addi a5, s2, -2048
 ; ILP32E-WITHFP-NEXT:    sw t1, 32(sp)
 ; ILP32E-WITHFP-NEXT:    sw t0, 36(sp)
 ; ILP32E-WITHFP-NEXT:    sw a7, 40(sp)
@@ -812,39 +812,39 @@ define void @caller_aligned_stack() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 18
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 17
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a3, 262236
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a3, a3, 655
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 16(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 20(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 24(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 28(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 377487
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, a0, 1475
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 15
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 14
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 262153
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, a0, 491
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 545260
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a7, a0, -1967
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 964690
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t0, a0, -328
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 335544
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t1, a0, 1311
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 688509
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a5, a0, -2048
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 18
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 17
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 16
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a6, 262236
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a7, 377487
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t0, 15
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t1, 14
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t2, 4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t3, 262153
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t4, 545260
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t5, 964690
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t6, 335544
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui s2, 688509
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 11
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a2, sp, 32
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, a6, 655
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 16(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a5, 20(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 24(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 28(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 12
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a4, a7, 1475
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t2, 0(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 4(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 8(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 13
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, t3, 491
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a7, t4, -1967
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t0, t5, -328
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi t1, t6, 1311
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a5, s2, -2048
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 32(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 36(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 40(sp)
@@ -867,39 +867,39 @@ define void @caller_aligned_stack() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 18
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 17
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a3, 262236
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a3, a3, 655
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 16(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 20(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 24(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 28(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 377487
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, a0, 1475
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 15
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 14
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 4(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 262153
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, a0, 491
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 545260
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a7, a0, -1967
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 964690
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t0, a0, -328
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 335544
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t1, a0, 1311
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 688509
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a5, a0, -2048
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 18
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 17
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 16
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a6, 262236
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a7, 377487
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t0, 15
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t1, 14
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t2, 4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t3, 262153
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t4, 545260
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t5, 964690
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t6, 335544
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui s2, 688509
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 11
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a2, sp, 32
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, a6, 655
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 16(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a5, 20(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 24(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 28(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 12
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a4, a7, 1475
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t2, 0(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 4(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 8(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 13
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, t3, 491
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a7, t4, -1967
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t0, t5, -328
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi t1, t6, 1311
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a5, s2, -2048
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 32(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 36(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 40(sp)
@@ -1157,12 +1157,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; ILP32E-FPELIM-NEXT:    srli a1, a1, 16
 ; ILP32E-FPELIM-NEXT:    add a0, a0, a2
 ; ILP32E-FPELIM-NEXT:    add a0, a0, a1
+; ILP32E-FPELIM-NEXT:    add a0, a0, a5
 ; ILP32E-FPELIM-NEXT:    xor a1, a4, t1
 ; ILP32E-FPELIM-NEXT:    xor a2, a3, t0
+; ILP32E-FPELIM-NEXT:    add a0, a0, a7
 ; ILP32E-FPELIM-NEXT:    or a1, a2, a1
 ; ILP32E-FPELIM-NEXT:    seqz a1, a1
-; ILP32E-FPELIM-NEXT:    add a0, a0, a5
-; ILP32E-FPELIM-NEXT:    add a0, a0, a7
 ; ILP32E-FPELIM-NEXT:    add a0, a0, a6
 ; ILP32E-FPELIM-NEXT:    add a0, a1, a0
 ; ILP32E-FPELIM-NEXT:    ret
@@ -1186,12 +1186,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; ILP32E-WITHFP-NEXT:    srli a1, a1, 16
 ; ILP32E-WITHFP-NEXT:    add a0, a0, a2
 ; ILP32E-WITHFP-NEXT:    add a0, a0, a1
+; ILP32E-WITHFP-NEXT:    add a0, a0, a5
 ; ILP32E-WITHFP-NEXT:    xor a1, a4, t1
 ; ILP32E-WITHFP-NEXT:    xor a2, a3, t0
+; ILP32E-WITHFP-NEXT:    add a0, a0, a7
 ; ILP32E-WITHFP-NEXT:    or a1, a2, a1
 ; ILP32E-WITHFP-NEXT:    seqz a1, a1
-; ILP32E-WITHFP-NEXT:    add a0, a0, a5
-; ILP32E-WITHFP-NEXT:    add a0, a0, a7
 ; ILP32E-WITHFP-NEXT:    add a0, a0, a6
 ; ILP32E-WITHFP-NEXT:    add a0, a1, a0
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 8
@@ -1214,12 +1214,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    srli a1, a1, 16
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a2
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a5
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a4, t1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a2, a3, t0
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a7
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a1, a2, a1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    seqz a1, a1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a5
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a7
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a0, a6
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    add a0, a1, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    ret
@@ -1241,12 +1241,12 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i32 %e, i32 %f, i
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    srli a1, a1, 16
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a2
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a5
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a4, t1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a2, a3, t0
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a7
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a1, a2, a1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    seqz a1, a1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a5
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a7
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a0, a6
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    add a0, a1, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 8
@@ -1386,21 +1386,21 @@ define i32 @caller_many_scalars() {
 define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-FPELIM-LABEL: callee_large_scalars:
 ; ILP32E-FPELIM:       # %bb.0:
-; ILP32E-FPELIM-NEXT:    lw a2, 0(a0)
-; ILP32E-FPELIM-NEXT:    lw a3, 4(a0)
-; ILP32E-FPELIM-NEXT:    lw a4, 12(a1)
+; ILP32E-FPELIM-NEXT:    lw a2, 0(a1)
+; ILP32E-FPELIM-NEXT:    lw a3, 4(a1)
+; ILP32E-FPELIM-NEXT:    lw a4, 8(a1)
+; ILP32E-FPELIM-NEXT:    lw a1, 12(a1)
 ; ILP32E-FPELIM-NEXT:    lw a5, 12(a0)
-; ILP32E-FPELIM-NEXT:    lw a6, 0(a1)
-; ILP32E-FPELIM-NEXT:    lw a7, 4(a1)
-; ILP32E-FPELIM-NEXT:    lw a1, 8(a1)
-; ILP32E-FPELIM-NEXT:    lw a0, 8(a0)
-; ILP32E-FPELIM-NEXT:    xor a4, a5, a4
-; ILP32E-FPELIM-NEXT:    xor a3, a3, a7
-; ILP32E-FPELIM-NEXT:    or a3, a3, a4
-; ILP32E-FPELIM-NEXT:    xor a0, a0, a1
-; ILP32E-FPELIM-NEXT:    xor a1, a2, a6
-; ILP32E-FPELIM-NEXT:    or a0, a1, a0
-; ILP32E-FPELIM-NEXT:    or a0, a0, a3
+; ILP32E-FPELIM-NEXT:    lw a6, 4(a0)
+; ILP32E-FPELIM-NEXT:    lw a7, 8(a0)
+; ILP32E-FPELIM-NEXT:    lw a0, 0(a0)
+; ILP32E-FPELIM-NEXT:    xor a1, a5, a1
+; ILP32E-FPELIM-NEXT:    xor a3, a6, a3
+; ILP32E-FPELIM-NEXT:    xor a4, a7, a4
+; ILP32E-FPELIM-NEXT:    xor a0, a0, a2
+; ILP32E-FPELIM-NEXT:    or a1, a3, a1
+; ILP32E-FPELIM-NEXT:    or a0, a0, a4
+; ILP32E-FPELIM-NEXT:    or a0, a0, a1
 ; ILP32E-FPELIM-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-NEXT:    ret
 ;
@@ -1414,21 +1414,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-WITHFP-NEXT:    .cfi_offset s0, -8
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
-; ILP32E-WITHFP-NEXT:    lw a2, 0(a0)
-; ILP32E-WITHFP-NEXT:    lw a3, 4(a0)
-; ILP32E-WITHFP-NEXT:    lw a4, 12(a1)
+; ILP32E-WITHFP-NEXT:    lw a2, 0(a1)
+; ILP32E-WITHFP-NEXT:    lw a3, 4(a1)
+; ILP32E-WITHFP-NEXT:    lw a4, 8(a1)
+; ILP32E-WITHFP-NEXT:    lw a1, 12(a1)
 ; ILP32E-WITHFP-NEXT:    lw a5, 12(a0)
-; ILP32E-WITHFP-NEXT:    lw a6, 0(a1)
-; ILP32E-WITHFP-NEXT:    lw a7, 4(a1)
-; ILP32E-WITHFP-NEXT:    lw a1, 8(a1)
-; ILP32E-WITHFP-NEXT:    lw a0, 8(a0)
-; ILP32E-WITHFP-NEXT:    xor a4, a5, a4
-; ILP32E-WITHFP-NEXT:    xor a3, a3, a7
-; ILP32E-WITHFP-NEXT:    or a3, a3, a4
-; ILP32E-WITHFP-NEXT:    xor a0, a0, a1
-; ILP32E-WITHFP-NEXT:    xor a1, a2, a6
-; ILP32E-WITHFP-NEXT:    or a0, a1, a0
-; ILP32E-WITHFP-NEXT:    or a0, a0, a3
+; ILP32E-WITHFP-NEXT:    lw a6, 4(a0)
+; ILP32E-WITHFP-NEXT:    lw a7, 8(a0)
+; ILP32E-WITHFP-NEXT:    lw a0, 0(a0)
+; ILP32E-WITHFP-NEXT:    xor a1, a5, a1
+; ILP32E-WITHFP-NEXT:    xor a3, a6, a3
+; ILP32E-WITHFP-NEXT:    xor a4, a7, a4
+; ILP32E-WITHFP-NEXT:    xor a0, a0, a2
+; ILP32E-WITHFP-NEXT:    or a1, a3, a1
+; ILP32E-WITHFP-NEXT:    or a0, a0, a4
+; ILP32E-WITHFP-NEXT:    or a0, a0, a1
 ; ILP32E-WITHFP-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-NEXT:    lw ra, 4(sp) # 4-byte Folded Reload
@@ -1441,21 +1441,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ;
 ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars:
 ; ILP32E-FPELIM-SAVE-RESTORE:       # %bb.0:
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a2, 0(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 12(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a2, 0(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a3, 4(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 8(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 12(a1)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 0(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 4(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 8(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 8(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a5, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a3, a7
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a3, a3, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a0, a1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a2, a6
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a1, a0
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 4(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 8(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 0(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a5, a1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a6, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a7, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a0, a2
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a1, a3, a1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    ret
 ;
@@ -1467,21 +1467,21 @@ define i32 @callee_large_scalars(i128 %a, fp128 %b) {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_offset s0, -8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a2, 0(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 12(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a2, 0(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a3, 4(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 8(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 12(a1)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 0(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 4(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 8(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 8(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a5, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a3, a7
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a3, a3, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a0, a1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a2, a6
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a1, a0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 4(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 8(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 0(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a5, a1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a6, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a7, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a0, a2
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a1, a3, a1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    tail __riscv_restore_1
@@ -1503,13 +1503,13 @@ define i32 @caller_large_scalars() {
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 48
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    lui a0, 524272
+; ILP32E-FPELIM-NEXT:    lui a1, 524272
+; ILP32E-FPELIM-NEXT:    li a2, 1
+; ILP32E-FPELIM-NEXT:    addi a0, sp, 24
 ; ILP32E-FPELIM-NEXT:    sw zero, 0(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 4(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-NEXT:    li a2, 1
-; ILP32E-FPELIM-NEXT:    addi a0, sp, 24
+; ILP32E-FPELIM-NEXT:    sw a1, 12(sp)
 ; ILP32E-FPELIM-NEXT:    mv a1, sp
 ; ILP32E-FPELIM-NEXT:    sw a2, 24(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 28(sp)
@@ -1537,13 +1537,13 @@ define i32 @caller_large_scalars() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 48
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    lui a0, 524272
+; ILP32E-WITHFP-NEXT:    lui a1, 524272
+; ILP32E-WITHFP-NEXT:    li a2, 1
+; ILP32E-WITHFP-NEXT:    addi a0, sp, 24
 ; ILP32E-WITHFP-NEXT:    sw zero, 0(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 4(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-NEXT:    li a2, 1
-; ILP32E-WITHFP-NEXT:    addi a0, sp, 24
+; ILP32E-WITHFP-NEXT:    sw a1, 12(sp)
 ; ILP32E-WITHFP-NEXT:    mv a1, sp
 ; ILP32E-WITHFP-NEXT:    sw a2, 24(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 28(sp)
@@ -1571,13 +1571,13 @@ define i32 @caller_large_scalars() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi s0, sp, 48
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 524272
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a1, 524272
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 1
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, sp, 24
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 0(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 4(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, sp, 24
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 12(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    mv a1, sp
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 24(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 28(sp)
@@ -1601,13 +1601,13 @@ define i32 @caller_large_scalars() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 48
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 524272
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a1, 524272
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 1
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, sp, 24
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 0(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 4(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 8(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, sp, 24
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 12(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    mv a1, sp
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 24(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 28(sp)
@@ -1630,23 +1630,23 @@ define i32 @caller_large_scalars() {
 define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i128 %h, i32 %i, fp128 %j) {
 ; ILP32E-FPELIM-LABEL: callee_large_scalars_exhausted_regs:
 ; ILP32E-FPELIM:       # %bb.0:
-; ILP32E-FPELIM-NEXT:    lw a0, 4(sp)
-; ILP32E-FPELIM-NEXT:    lw a1, 12(sp)
+; ILP32E-FPELIM-NEXT:    lw a0, 12(sp)
+; ILP32E-FPELIM-NEXT:    lw a1, 4(sp)
 ; ILP32E-FPELIM-NEXT:    lw a2, 0(a0)
 ; ILP32E-FPELIM-NEXT:    lw a3, 4(a0)
-; ILP32E-FPELIM-NEXT:    lw a4, 12(a1)
-; ILP32E-FPELIM-NEXT:    lw a5, 12(a0)
-; ILP32E-FPELIM-NEXT:    lw a6, 0(a1)
-; ILP32E-FPELIM-NEXT:    lw a7, 4(a1)
-; ILP32E-FPELIM-NEXT:    lw a1, 8(a1)
-; ILP32E-FPELIM-NEXT:    lw a0, 8(a0)
-; ILP32E-FPELIM-NEXT:    xor a4, a5, a4
-; ILP32E-FPELIM-NEXT:    xor a3, a3, a7
-; ILP32E-FPELIM-NEXT:    or a3, a3, a4
-; ILP32E-FPELIM-NEXT:    xor a0, a0, a1
-; ILP32E-FPELIM-NEXT:    xor a1, a2, a6
+; ILP32E-FPELIM-NEXT:    lw a4, 8(a0)
+; ILP32E-FPELIM-NEXT:    lw a0, 12(a0)
+; ILP32E-FPELIM-NEXT:    lw a5, 12(a1)
+; ILP32E-FPELIM-NEXT:    lw a6, 4(a1)
+; ILP32E-FPELIM-NEXT:    lw a7, 8(a1)
+; ILP32E-FPELIM-NEXT:    lw a1, 0(a1)
+; ILP32E-FPELIM-NEXT:    xor a0, a5, a0
+; ILP32E-FPELIM-NEXT:    xor a3, a6, a3
+; ILP32E-FPELIM-NEXT:    xor a4, a7, a4
+; ILP32E-FPELIM-NEXT:    xor a1, a1, a2
+; ILP32E-FPELIM-NEXT:    or a0, a3, a0
+; ILP32E-FPELIM-NEXT:    or a1, a1, a4
 ; ILP32E-FPELIM-NEXT:    or a0, a1, a0
-; ILP32E-FPELIM-NEXT:    or a0, a0, a3
 ; ILP32E-FPELIM-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-NEXT:    ret
 ;
@@ -1660,23 +1660,23 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; ILP32E-WITHFP-NEXT:    .cfi_offset s0, -8
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
-; ILP32E-WITHFP-NEXT:    lw a0, 4(s0)
-; ILP32E-WITHFP-NEXT:    lw a1, 12(s0)
+; ILP32E-WITHFP-NEXT:    lw a0, 12(s0)
+; ILP32E-WITHFP-NEXT:    lw a1, 4(s0)
 ; ILP32E-WITHFP-NEXT:    lw a2, 0(a0)
 ; ILP32E-WITHFP-NEXT:    lw a3, 4(a0)
-; ILP32E-WITHFP-NEXT:    lw a4, 12(a1)
-; ILP32E-WITHFP-NEXT:    lw a5, 12(a0)
-; ILP32E-WITHFP-NEXT:    lw a6, 0(a1)
-; ILP32E-WITHFP-NEXT:    lw a7, 4(a1)
-; ILP32E-WITHFP-NEXT:    lw a1, 8(a1)
-; ILP32E-WITHFP-NEXT:    lw a0, 8(a0)
-; ILP32E-WITHFP-NEXT:    xor a4, a5, a4
-; ILP32E-WITHFP-NEXT:    xor a3, a3, a7
-; ILP32E-WITHFP-NEXT:    or a3, a3, a4
-; ILP32E-WITHFP-NEXT:    xor a0, a0, a1
-; ILP32E-WITHFP-NEXT:    xor a1, a2, a6
+; ILP32E-WITHFP-NEXT:    lw a4, 8(a0)
+; ILP32E-WITHFP-NEXT:    lw a0, 12(a0)
+; ILP32E-WITHFP-NEXT:    lw a5, 12(a1)
+; ILP32E-WITHFP-NEXT:    lw a6, 4(a1)
+; ILP32E-WITHFP-NEXT:    lw a7, 8(a1)
+; ILP32E-WITHFP-NEXT:    lw a1, 0(a1)
+; ILP32E-WITHFP-NEXT:    xor a0, a5, a0
+; ILP32E-WITHFP-NEXT:    xor a3, a6, a3
+; ILP32E-WITHFP-NEXT:    xor a4, a7, a4
+; ILP32E-WITHFP-NEXT:    xor a1, a1, a2
+; ILP32E-WITHFP-NEXT:    or a0, a3, a0
+; ILP32E-WITHFP-NEXT:    or a1, a1, a4
 ; ILP32E-WITHFP-NEXT:    or a0, a1, a0
-; ILP32E-WITHFP-NEXT:    or a0, a0, a3
 ; ILP32E-WITHFP-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-NEXT:    lw ra, 4(sp) # 4-byte Folded Reload
@@ -1689,23 +1689,23 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ;
 ; ILP32E-FPELIM-SAVE-RESTORE-LABEL: callee_large_scalars_exhausted_regs:
 ; ILP32E-FPELIM-SAVE-RESTORE:       # %bb.0:
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 4(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 12(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 12(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 4(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a2, 0(a0)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 12(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 0(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 4(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 8(a1)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 8(a0)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a5, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a3, a7
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a3, a3, a4
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a0, a1
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a2, a6
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a4, 8(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a0, 12(a0)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a5, 12(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a6, 4(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a7, 8(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lw a1, 0(a1)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a0, a5, a0
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a3, a6, a3
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a4, a7, a4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    xor a1, a1, a2
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a3, a0
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a1, a1, a4
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a1, a0
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    or a0, a0, a3
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    ret
 ;
@@ -1717,23 +1717,23 @@ define i32 @callee_large_scalars_exhausted_regs(i32 %a, i32 %b, i32 %c, i32 %d,
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_offset s0, -8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 4(s0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 12(s0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 12(s0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 4(s0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a2, 0(a0)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a3, 4(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 12(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 12(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 0(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 4(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 8(a1)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 8(a0)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a5, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a3, a7
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a3, a3, a4
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a0, a1
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a2, a6
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a4, 8(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a0, 12(a0)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a5, 12(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a6, 4(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a7, 8(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lw a1, 0(a1)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a0, a5, a0
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a3, a6, a3
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a4, a7, a4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    xor a1, a1, a2
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a3, a0
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a1, a1, a4
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a1, a0
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    or a0, a0, a3
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    seqz a0, a0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa sp, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    tail __riscv_restore_1
@@ -1755,27 +1755,27 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    addi a0, sp, 16
-; ILP32E-FPELIM-NEXT:    li a1, 9
-; ILP32E-FPELIM-NEXT:    addi a2, sp, 40
-; ILP32E-FPELIM-NEXT:    li a3, 7
-; ILP32E-FPELIM-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-NEXT:    sw a1, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-NEXT:    lui a0, 524272
-; ILP32E-FPELIM-NEXT:    sw zero, 16(sp)
-; ILP32E-FPELIM-NEXT:    sw zero, 20(sp)
-; ILP32E-FPELIM-NEXT:    sw zero, 24(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 28(sp)
-; ILP32E-FPELIM-NEXT:    li a6, 8
+; ILP32E-FPELIM-NEXT:    addi a4, sp, 16
+; ILP32E-FPELIM-NEXT:    li a5, 9
+; ILP32E-FPELIM-NEXT:    addi a6, sp, 40
+; ILP32E-FPELIM-NEXT:    li a7, 7
+; ILP32E-FPELIM-NEXT:    lui t0, 524272
+; ILP32E-FPELIM-NEXT:    li t1, 8
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a1, 2
 ; ILP32E-FPELIM-NEXT:    li a2, 3
 ; ILP32E-FPELIM-NEXT:    li a3, 4
+; ILP32E-FPELIM-NEXT:    sw a7, 0(sp)
+; ILP32E-FPELIM-NEXT:    sw a6, 4(sp)
+; ILP32E-FPELIM-NEXT:    sw a5, 8(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-NEXT:    li a4, 5
+; ILP32E-FPELIM-NEXT:    sw zero, 16(sp)
+; ILP32E-FPELIM-NEXT:    sw zero, 20(sp)
+; ILP32E-FPELIM-NEXT:    sw zero, 24(sp)
+; ILP32E-FPELIM-NEXT:    sw t0, 28(sp)
 ; ILP32E-FPELIM-NEXT:    li a5, 6
-; ILP32E-FPELIM-NEXT:    sw a6, 40(sp)
+; ILP32E-FPELIM-NEXT:    sw t1, 40(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 44(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 48(sp)
 ; ILP32E-FPELIM-NEXT:    sw zero, 52(sp)
@@ -1801,27 +1801,27 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    addi a0, sp, 16
-; ILP32E-WITHFP-NEXT:    li a1, 9
-; ILP32E-WITHFP-NEXT:    addi a2, sp, 40
-; ILP32E-WITHFP-NEXT:    li a3, 7
-; ILP32E-WITHFP-NEXT:    sw a3, 0(sp)
-; ILP32E-WITHFP-NEXT:    sw a2, 4(sp)
-; ILP32E-WITHFP-NEXT:    sw a1, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-NEXT:    lui a0, 524272
-; ILP32E-WITHFP-NEXT:    sw zero, 16(sp)
-; ILP32E-WITHFP-NEXT:    sw zero, 20(sp)
-; ILP32E-WITHFP-NEXT:    sw zero, 24(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 28(sp)
-; ILP32E-WITHFP-NEXT:    li a6, 8
+; ILP32E-WITHFP-NEXT:    addi a4, sp, 16
+; ILP32E-WITHFP-NEXT:    li a5, 9
+; ILP32E-WITHFP-NEXT:    addi a6, sp, 40
+; ILP32E-WITHFP-NEXT:    li a7, 7
+; ILP32E-WITHFP-NEXT:    lui t0, 524272
+; ILP32E-WITHFP-NEXT:    li t1, 8
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a1, 2
 ; ILP32E-WITHFP-NEXT:    li a2, 3
 ; ILP32E-WITHFP-NEXT:    li a3, 4
+; ILP32E-WITHFP-NEXT:    sw a7, 0(sp)
+; ILP32E-WITHFP-NEXT:    sw a6, 4(sp)
+; ILP32E-WITHFP-NEXT:    sw a5, 8(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 12(sp)
 ; ILP32E-WITHFP-NEXT:    li a4, 5
+; ILP32E-WITHFP-NEXT:    sw zero, 16(sp)
+; ILP32E-WITHFP-NEXT:    sw zero, 20(sp)
+; ILP32E-WITHFP-NEXT:    sw zero, 24(sp)
+; ILP32E-WITHFP-NEXT:    sw t0, 28(sp)
 ; ILP32E-WITHFP-NEXT:    li a5, 6
-; ILP32E-WITHFP-NEXT:    sw a6, 40(sp)
+; ILP32E-WITHFP-NEXT:    sw t1, 40(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 44(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 48(sp)
 ; ILP32E-WITHFP-NEXT:    sw zero, 52(sp)
@@ -1847,27 +1847,27 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a0, sp, 16
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 9
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a2, sp, 40
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 7
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui a0, 524272
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a0, 28(sp)
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a6, 8
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a4, sp, 16
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 9
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    addi a6, sp, 40
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a7, 7
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    lui t0, 524272
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li t1, 8
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a1, 2
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a2, 3
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a3, 4
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a7, 0(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 4(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a5, 8(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a4, 5
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t0, 28(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    li a5, 6
-; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw a6, 40(sp)
+; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw t1, 40(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 44(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 48(sp)
 ; ILP32E-FPELIM-SAVE-RESTORE-NEXT:    sw zero, 52(sp)
@@ -1889,27 +1889,27 @@ define i32 @caller_large_scalars_exhausted_regs() {
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    .cfi_def_cfa s0, 0
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a0, sp, 16
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 9
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a2, sp, 40
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 7
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a3, 0(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a2, 4(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a1, 8(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui a0, 524272
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a0, 28(sp)
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a6, 8
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a4, sp, 16
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 9
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    addi a6, sp, 40
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a7, 7
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    lui t0, 524272
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li t1, 8
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a0, 1
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a1, 2
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a2, 3
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a3, 4
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a7, 0(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 4(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a5, 8(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a4, 12(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a4, 5
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 16(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 20(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 24(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t0, 28(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    li a5, 6
-; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw a6, 40(sp)
+; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw t1, 40(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 44(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 48(sp)
 ; ILP32E-WITHFP-SAVE-RESTORE-NEXT:    sw zero, 52(sp)

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
index b0d60a7aaa235f..dabd2a7ce9a73d 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32f-ilp32d-common.ll
@@ -55,11 +55,11 @@ define i32 @caller_float_in_fpr_exhausted_gprs() nounwind {
 ; RV32-ILP32FD-NEXT:    addi sp, sp, -16
 ; RV32-ILP32FD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32FD-NEXT:    li a1, 5
-; RV32-ILP32FD-NEXT:    lui a0, 265216
-; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a0
+; RV32-ILP32FD-NEXT:    lui a3, 265216
 ; RV32-ILP32FD-NEXT:    li a0, 1
 ; RV32-ILP32FD-NEXT:    li a2, 2
 ; RV32-ILP32FD-NEXT:    li a4, 3
+; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a3
 ; RV32-ILP32FD-NEXT:    li a6, 4
 ; RV32-ILP32FD-NEXT:    sw a1, 0(sp)
 ; RV32-ILP32FD-NEXT:    li a1, 0
@@ -96,21 +96,21 @@ define i32 @caller_float_in_gpr_exhausted_fprs() nounwind {
 ; RV32-ILP32FD-NEXT:    addi sp, sp, -16
 ; RV32-ILP32FD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32FD-NEXT:    lui a0, 260096
+; RV32-ILP32FD-NEXT:    lui a1, 262144
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a0
-; RV32-ILP32FD-NEXT:    lui a0, 262144
-; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a0
 ; RV32-ILP32FD-NEXT:    lui a0, 263168
+; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a1
+; RV32-ILP32FD-NEXT:    lui a1, 264192
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa2, a0
-; RV32-ILP32FD-NEXT:    lui a0, 264192
-; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a0
 ; RV32-ILP32FD-NEXT:    lui a0, 264704
+; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a1
+; RV32-ILP32FD-NEXT:    lui a1, 265216
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa4, a0
-; RV32-ILP32FD-NEXT:    lui a0, 265216
-; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a0
 ; RV32-ILP32FD-NEXT:    lui a0, 265728
+; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a1
+; RV32-ILP32FD-NEXT:    lui a1, 266240
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa6, a0
-; RV32-ILP32FD-NEXT:    lui a0, 266240
-; RV32-ILP32FD-NEXT:    fmv.w.x fa7, a0
+; RV32-ILP32FD-NEXT:    fmv.w.x fa7, a1
 ; RV32-ILP32FD-NEXT:    lui a0, 266496
 ; RV32-ILP32FD-NEXT:    call callee_float_in_gpr_exhausted_fprs
 ; RV32-ILP32FD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -143,24 +143,24 @@ define i32 @caller_float_on_stack_exhausted_gprs_fprs() nounwind {
 ; RV32-ILP32FD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-ILP32FD-NEXT:    lui a1, 267520
 ; RV32-ILP32FD-NEXT:    lui a0, 262144
+; RV32-ILP32FD-NEXT:    lui a2, 264192
+; RV32-ILP32FD-NEXT:    lui a3, 265216
+; RV32-ILP32FD-NEXT:    lui a4, 266240
+; RV32-ILP32FD-NEXT:    lui a5, 266496
+; RV32-ILP32FD-NEXT:    lui a6, 266752
+; RV32-ILP32FD-NEXT:    lui a7, 267008
 ; RV32-ILP32FD-NEXT:    fmv.w.x fa0, a0
-; RV32-ILP32FD-NEXT:    lui a0, 264192
-; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a0
-; RV32-ILP32FD-NEXT:    lui a0, 265216
-; RV32-ILP32FD-NEXT:    fmv.w.x fa2, a0
-; RV32-ILP32FD-NEXT:    lui a0, 266240
-; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a0
-; RV32-ILP32FD-NEXT:    lui a0, 266496
-; RV32-ILP32FD-NEXT:    fmv.w.x fa4, a0
-; RV32-ILP32FD-NEXT:    lui a0, 266752
-; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a0
-; RV32-ILP32FD-NEXT:    lui a0, 267008
-; RV32-ILP32FD-NEXT:    fmv.w.x fa6, a0
-; RV32-ILP32FD-NEXT:    lui a0, 267264
-; RV32-ILP32FD-NEXT:    fmv.w.x fa7, a0
+; RV32-ILP32FD-NEXT:    lui t0, 267264
+; RV32-ILP32FD-NEXT:    fmv.w.x fa1, a2
 ; RV32-ILP32FD-NEXT:    li a0, 1
+; RV32-ILP32FD-NEXT:    fmv.w.x fa2, a3
 ; RV32-ILP32FD-NEXT:    li a2, 3
+; RV32-ILP32FD-NEXT:    fmv.w.x fa3, a4
 ; RV32-ILP32FD-NEXT:    li a4, 5
+; RV32-ILP32FD-NEXT:    fmv.w.x fa4, a5
+; RV32-ILP32FD-NEXT:    fmv.w.x fa5, a6
+; RV32-ILP32FD-NEXT:    fmv.w.x fa6, a7
+; RV32-ILP32FD-NEXT:    fmv.w.x fa7, t0
 ; RV32-ILP32FD-NEXT:    li a6, 7
 ; RV32-ILP32FD-NEXT:    sw a1, 0(sp)
 ; RV32-ILP32FD-NEXT:    li a1, 0

diff  --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
index cbd2cef981d71f..746b71a08a30ba 100644
--- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
+++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll
@@ -52,15 +52,15 @@ define i32 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i128 %d, i32 %e, i32 %f,
 ; RV64I-NEXT:    ld t1, 0(sp)
 ; RV64I-NEXT:    andi a0, a0, 255
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    xor a3, a3, a7
 ; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    xor a1, a4, t1
-; RV64I-NEXT:    xor a2, a3, a7
-; RV64I-NEXT:    or a1, a2, a1
-; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    add a0, a0, a5
+; RV64I-NEXT:    xor a1, a4, t1
 ; RV64I-NEXT:    add a0, a0, a6
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    add a0, a0, t0
 ; RV64I-NEXT:    addw a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -106,21 +106,21 @@ define i32 @caller_many_scalars() nounwind {
 define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 ; RV64I-LABEL: callee_large_scalars:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    ld a2, 0(a0)
-; RV64I-NEXT:    ld a3, 8(a0)
-; RV64I-NEXT:    ld a4, 24(a1)
+; RV64I-NEXT:    ld a2, 0(a1)
+; RV64I-NEXT:    ld a3, 8(a1)
+; RV64I-NEXT:    ld a4, 16(a1)
+; RV64I-NEXT:    ld a1, 24(a1)
 ; RV64I-NEXT:    ld a5, 24(a0)
-; RV64I-NEXT:    ld a6, 0(a1)
-; RV64I-NEXT:    ld a7, 8(a1)
-; RV64I-NEXT:    ld a1, 16(a1)
-; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    xor a4, a5, a4
-; RV64I-NEXT:    xor a3, a3, a7
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    xor a0, a0, a1
-; RV64I-NEXT:    xor a1, a2, a6
-; RV64I-NEXT:    or a0, a1, a0
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    ld a6, 8(a0)
+; RV64I-NEXT:    ld a7, 16(a0)
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    xor a1, a5, a1
+; RV64I-NEXT:    xor a3, a6, a3
+; RV64I-NEXT:    xor a4, a7, a4
+; RV64I-NEXT:    xor a0, a0, a2
+; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
   %1 = icmp eq i256 %a, %b
@@ -133,15 +133,15 @@ define i64 @caller_large_scalars() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -80
 ; RV64I-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    li a0, 2
-; RV64I-NEXT:    sd a0, 0(sp)
+; RV64I-NEXT:    li a2, 2
+; RV64I-NEXT:    li a3, 1
+; RV64I-NEXT:    addi a0, sp, 32
+; RV64I-NEXT:    mv a1, sp
+; RV64I-NEXT:    sd a2, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    li a2, 1
-; RV64I-NEXT:    addi a0, sp, 32
-; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    sd a2, 32(sp)
+; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
@@ -163,18 +163,18 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 ; RV64I-NEXT:    ld a0, 8(sp)
 ; RV64I-NEXT:    ld a1, 0(a7)
 ; RV64I-NEXT:    ld a2, 8(a7)
-; RV64I-NEXT:    ld a3, 24(a0)
+; RV64I-NEXT:    ld a3, 16(a7)
 ; RV64I-NEXT:    ld a4, 24(a7)
-; RV64I-NEXT:    ld a5, 0(a0)
+; RV64I-NEXT:    ld a5, 24(a0)
 ; RV64I-NEXT:    ld a6, 8(a0)
-; RV64I-NEXT:    ld a0, 16(a0)
-; RV64I-NEXT:    ld a7, 16(a7)
-; RV64I-NEXT:    xor a3, a4, a3
+; RV64I-NEXT:    ld a7, 16(a0)
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    xor a4, a4, a5
 ; RV64I-NEXT:    xor a2, a2, a6
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    xor a0, a7, a0
-; RV64I-NEXT:    xor a1, a1, a5
-; RV64I-NEXT:    or a0, a1, a0
+; RV64I-NEXT:    xor a3, a3, a7
+; RV64I-NEXT:    xor a0, a1, a0
+; RV64I-NEXT:    or a2, a2, a4
+; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    ret
@@ -188,16 +188,10 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -96
 ; RV64I-NEXT:    sd ra, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    addi a0, sp, 16
-; RV64I-NEXT:    li a1, 9
-; RV64I-NEXT:    sd a1, 0(sp)
-; RV64I-NEXT:    sd a0, 8(sp)
-; RV64I-NEXT:    li a0, 10
-; RV64I-NEXT:    sd a0, 16(sp)
-; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    sd zero, 32(sp)
-; RV64I-NEXT:    sd zero, 40(sp)
-; RV64I-NEXT:    li t0, 8
+; RV64I-NEXT:    addi a7, sp, 16
+; RV64I-NEXT:    li t0, 9
+; RV64I-NEXT:    li t1, 10
+; RV64I-NEXT:    li t2, 8
 ; RV64I-NEXT:    li a0, 1
 ; RV64I-NEXT:    li a1, 2
 ; RV64I-NEXT:    li a2, 3
@@ -205,8 +199,14 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind {
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    li a5, 6
 ; RV64I-NEXT:    li a6, 7
+; RV64I-NEXT:    sd t0, 0(sp)
+; RV64I-NEXT:    sd a7, 8(sp)
 ; RV64I-NEXT:    addi a7, sp, 48
-; RV64I-NEXT:    sd t0, 48(sp)
+; RV64I-NEXT:    sd t1, 16(sp)
+; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    sd zero, 32(sp)
+; RV64I-NEXT:    sd zero, 40(sp)
+; RV64I-NEXT:    sd t2, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
 ; RV64I-NEXT:    sd zero, 64(sp)
 ; RV64I-NEXT:    sd zero, 72(sp)
@@ -356,24 +356,24 @@ define void @caller_aligned_stack() nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi sp, sp, -64
 ; RV64I-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    li a0, 12
-; RV64I-NEXT:    li a1, 11
-; RV64I-NEXT:    sd a1, 40(sp)
-; RV64I-NEXT:    sd a0, 48(sp)
-; RV64I-NEXT:    li a6, 10
-; RV64I-NEXT:    li t0, 9
-; RV64I-NEXT:    li t1, 8
+; RV64I-NEXT:    li a6, 12
+; RV64I-NEXT:    li a7, 11
+; RV64I-NEXT:    li t0, 10
+; RV64I-NEXT:    li t1, 9
+; RV64I-NEXT:    li t2, 8
 ; RV64I-NEXT:    li a0, 1
 ; RV64I-NEXT:    li a1, 2
 ; RV64I-NEXT:    li a2, 3
 ; RV64I-NEXT:    li a3, 4
 ; RV64I-NEXT:    li a4, 5
 ; RV64I-NEXT:    li a5, 6
+; RV64I-NEXT:    sd a7, 40(sp)
+; RV64I-NEXT:    sd a6, 48(sp)
 ; RV64I-NEXT:    li a7, 7
-; RV64I-NEXT:    sd t1, 0(sp)
-; RV64I-NEXT:    sd t0, 16(sp)
+; RV64I-NEXT:    sd t2, 0(sp)
+; RV64I-NEXT:    sd t1, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    sd a6, 32(sp)
+; RV64I-NEXT:    sd t0, 32(sp)
 ; RV64I-NEXT:    li a6, 0
 ; RV64I-NEXT:    call callee_aligned_stack
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
@@ -480,15 +480,15 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a1, 1
 ; RV64I-NEXT:    li a2, 2
+; RV64I-NEXT:    li a3, 3
+; RV64I-NEXT:    li a4, 4
 ; RV64I-NEXT:    sw a1, 0(a0)
 ; RV64I-NEXT:    sw zero, 4(a0)
 ; RV64I-NEXT:    sw a2, 8(a0)
 ; RV64I-NEXT:    sw zero, 12(a0)
-; RV64I-NEXT:    li a1, 3
-; RV64I-NEXT:    li a2, 4
-; RV64I-NEXT:    sw a1, 16(a0)
+; RV64I-NEXT:    sw a3, 16(a0)
 ; RV64I-NEXT:    sw zero, 20(a0)
-; RV64I-NEXT:    sw a2, 24(a0)
+; RV64I-NEXT:    sw a4, 24(a0)
 ; RV64I-NEXT:    sw zero, 28(a0)
 ; RV64I-NEXT:    ret
   store i64 1, ptr %agg.result, align 4

diff  --git a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
index b26bd7b889807a..6608874286e346 100644
--- a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
+++ b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
@@ -87,8 +87,8 @@ define signext i32 @test3(i32 signext %v, i32 signext %w, i32 signext %x, i32 si
 ; NOCMOV-NEXT:    seqz a4, a4
 ; NOCMOV-NEXT:    addi a4, a4, -1
 ; NOCMOV-NEXT:    and a1, a1, a4
-; NOCMOV-NEXT:    xor a0, a0, a1
 ; NOCMOV-NEXT:    and a3, a3, a4
+; NOCMOV-NEXT:    xor a0, a0, a1
 ; NOCMOV-NEXT:    xor a2, a2, a3
 ; NOCMOV-NEXT:    addw a0, a0, a2
 ; NOCMOV-NEXT:    ret
@@ -96,16 +96,16 @@ define signext i32 @test3(i32 signext %v, i32 signext %w, i32 signext %x, i32 si
 ; CMOV-LABEL: test3:
 ; CMOV:       # %bb.0:
 ; CMOV-NEXT:    xor a1, a1, a0
+; CMOV-NEXT:    xor a3, a3, a2
 ; CMOV-NEXT:    bnez a4, .LBB2_2
 ; CMOV-NEXT:  # %bb.1:
 ; CMOV-NEXT:    mv a1, a0
 ; CMOV-NEXT:  .LBB2_2:
-; CMOV-NEXT:    xor a0, a2, a3
 ; CMOV-NEXT:    bnez a4, .LBB2_4
 ; CMOV-NEXT:  # %bb.3:
-; CMOV-NEXT:    mv a0, a2
+; CMOV-NEXT:    mv a3, a2
 ; CMOV-NEXT:  .LBB2_4:
-; CMOV-NEXT:    addw a0, a0, a1
+; CMOV-NEXT:    addw a0, a1, a3
 ; CMOV-NEXT:    ret
 ;
 ; SHORT_FORWARD-LABEL: test3:

diff  --git a/llvm/test/CodeGen/RISCV/compress.ll b/llvm/test/CodeGen/RISCV/compress.ll
index 8fb520fac41ee0..c8803773d76306 100644
--- a/llvm/test/CodeGen/RISCV/compress.ll
+++ b/llvm/test/CodeGen/RISCV/compress.ll
@@ -32,9 +32,9 @@
 define i32 @simple_arith(i32 %a, i32 %b) #0 {
 ; RV32IC-LABEL: <simple_arith>:
 ; RV32IC:         addi a2, a0, 0x1
+; RV32IC-NEXT:    c.srai a1, 0x9
 ; RV32IC-NEXT:    c.andi a2, 0xb
 ; RV32IC-NEXT:    c.slli a2, 0x7
-; RV32IC-NEXT:    c.srai a1, 0x9
 ; RV32IC-NEXT:    sub a0, a1, a0
 ; RV32IC-NEXT:    c.add a0, a2
 ; RV32IC-NEXT:    c.jr ra

diff  --git a/llvm/test/CodeGen/RISCV/condbinops.ll b/llvm/test/CodeGen/RISCV/condbinops.ll
index 1a661fddacfa05..dc81c13bfb6a35 100644
--- a/llvm/test/CodeGen/RISCV/condbinops.ll
+++ b/llvm/test/CodeGen/RISCV/condbinops.ll
@@ -453,19 +453,19 @@ define i64 @shl64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-LABEL: shl64:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    srli a3, a0, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
-; RV32ZICOND-NEXT:    sll a3, a0, a2
+; RV32ZICOND-NEXT:    sll a0, a0, a2
 ; RV32ZICOND-NEXT:    addi a4, a2, -32
-; RV32ZICOND-NEXT:    slti a4, a4, 0
-; RV32ZICOND-NEXT:    czero.nez a5, a3, a4
 ; RV32ZICOND-NEXT:    sll a1, a1, a2
 ; RV32ZICOND-NEXT:    not a2, a2
-; RV32ZICOND-NEXT:    srli a0, a0, 1
-; RV32ZICOND-NEXT:    srl a0, a0, a2
-; RV32ZICOND-NEXT:    or a0, a1, a0
-; RV32ZICOND-NEXT:    czero.eqz a1, a0, a4
-; RV32ZICOND-NEXT:    or a1, a1, a5
-; RV32ZICOND-NEXT:    czero.eqz a0, a3, a4
+; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    srl a2, a3, a2
+; RV32ZICOND-NEXT:    czero.nez a3, a0, a4
+; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
+; RV32ZICOND-NEXT:    or a1, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: shl64:
@@ -527,22 +527,22 @@ define i64 @ashr64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-LABEL: ashr64:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    slli a3, a1, 1
+; RV32ZICOND-NEXT:    srai a5, a1, 31
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
-; RV32ZICOND-NEXT:    sra a3, a1, a2
+; RV32ZICOND-NEXT:    sra a1, a1, a2
 ; RV32ZICOND-NEXT:    addi a4, a2, -32
-; RV32ZICOND-NEXT:    slti a4, a4, 0
-; RV32ZICOND-NEXT:    czero.nez a5, a3, a4
 ; RV32ZICOND-NEXT:    srl a0, a0, a2
 ; RV32ZICOND-NEXT:    not a2, a2
-; RV32ZICOND-NEXT:    slli a6, a1, 1
-; RV32ZICOND-NEXT:    sll a2, a6, a2
+; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    sll a2, a3, a2
+; RV32ZICOND-NEXT:    czero.nez a3, a1, a4
 ; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
+; RV32ZICOND-NEXT:    czero.nez a2, a5, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a5
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a4
-; RV32ZICOND-NEXT:    srai a1, a1, 31
-; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a2, a1
+; RV32ZICOND-NEXT:    or a0, a0, a3
+; RV32ZICOND-NEXT:    or a1, a1, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: ashr64:
@@ -604,19 +604,19 @@ define i64 @lshr64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-LABEL: lshr64:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    slli a3, a1, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
-; RV32ZICOND-NEXT:    srl a3, a1, a2
+; RV32ZICOND-NEXT:    srl a1, a1, a2
 ; RV32ZICOND-NEXT:    addi a4, a2, -32
-; RV32ZICOND-NEXT:    slti a4, a4, 0
-; RV32ZICOND-NEXT:    czero.nez a5, a3, a4
 ; RV32ZICOND-NEXT:    srl a0, a0, a2
 ; RV32ZICOND-NEXT:    not a2, a2
-; RV32ZICOND-NEXT:    slli a1, a1, 1
-; RV32ZICOND-NEXT:    sll a1, a1, a2
-; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    slti a4, a4, 0
+; RV32ZICOND-NEXT:    sll a2, a3, a2
+; RV32ZICOND-NEXT:    czero.nez a3, a1, a4
+; RV32ZICOND-NEXT:    or a0, a0, a2
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a5
-; RV32ZICOND-NEXT:    czero.eqz a1, a3, a4
+; RV32ZICOND-NEXT:    or a0, a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: lshr64:
@@ -636,10 +636,10 @@ define i64 @sub64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    sltu a5, a0, a2
 ; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    sltu a4, a0, a2
 ; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a5
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
@@ -669,10 +669,10 @@ define i64 @sub64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
-; RV32ZICOND-NEXT:    sltu a5, a0, a2
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
+; RV32ZICOND-NEXT:    sltu a4, a0, a2
 ; RV32ZICOND-NEXT:    sub a1, a1, a3
-; RV32ZICOND-NEXT:    sub a1, a1, a5
+; RV32ZICOND-NEXT:    sub a1, a1, a4
 ; RV32ZICOND-NEXT:    sub a0, a0, a2
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -728,8 +728,8 @@ define i64 @and64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND-NEXT:    and a3, a1, a3
 ; RV32ZICOND-NEXT:    and a2, a0, a2
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a2, a0
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
+; RV32ZICOND-NEXT:    or a0, a2, a0
 ; RV32ZICOND-NEXT:    or a1, a3, a1
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -752,8 +752,8 @@ define i64 @add64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a3, a4, a3
-; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    and a2, a4, a2
+; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
 ; RV32I-NEXT:    add a1, a1, a0
@@ -786,8 +786,8 @@ define i64 @add64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
-; RV32ZICOND-NEXT:    add a1, a1, a3
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
+; RV32ZICOND-NEXT:    add a1, a1, a3
 ; RV32ZICOND-NEXT:    add a2, a0, a2
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
 ; RV32ZICOND-NEXT:    add a1, a1, a0
@@ -812,8 +812,8 @@ define i64 @or64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -843,9 +843,9 @@ define i64 @or64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
 ; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or64:
@@ -865,8 +865,8 @@ define i64 @xor64(i64 %x, i64 %y, i1 %c) {
 ; RV32I-NEXT:    slli a4, a4, 31
 ; RV32I-NEXT:    srai a4, a4, 31
 ; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
@@ -896,9 +896,9 @@ define i64 @xor64(i64 %x, i64 %y, i1 %c) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a4, a4, 1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
 ; RV32ZICOND-NEXT:    xor a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a4
-; RV32ZICOND-NEXT:    xor a1, a1, a2
+; RV32ZICOND-NEXT:    xor a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor64:

diff  --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll
index 622365cf13bcef..6c2ba493ffcd57 100644
--- a/llvm/test/CodeGen/RISCV/condops.ll
+++ b/llvm/test/CodeGen/RISCV/condops.ll
@@ -208,8 +208,8 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    and a4, a0, a4
-; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    sltu a1, a0, a1
 ; RV32I-NEXT:    add a1, a2, a1
@@ -225,8 +225,8 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: add1:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    add a0, a1, a0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a1
 ; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
@@ -247,8 +247,8 @@ define i64 @add1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: add1:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a4, a4, a0
-; RV32ZICOND-NEXT:    add a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a3, a0
+; RV32ZICOND-NEXT:    add a2, a2, a4
 ; RV32ZICOND-NEXT:    add a0, a1, a0
 ; RV32ZICOND-NEXT:    sltu a1, a0, a1
 ; RV32ZICOND-NEXT:    add a1, a2, a1
@@ -269,11 +269,11 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    add a2, a4, a2
 ; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    add a1, a4, a2
 ; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    sltu a1, a0, a3
-; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: add2:
@@ -286,11 +286,11 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: add2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    add a2, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    add a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    add a0, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a3
-; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    sltu a2, a0, a3
+; RV32XVENTANACONDOPS-NEXT:    add a1, a1, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: add2:
@@ -308,11 +308,11 @@ define i64 @add2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: add2:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
-; RV32ZICOND-NEXT:    add a2, a4, a2
 ; RV32ZICOND-NEXT:    czero.eqz a0, a1, a0
+; RV32ZICOND-NEXT:    add a1, a4, a2
 ; RV32ZICOND-NEXT:    add a0, a3, a0
-; RV32ZICOND-NEXT:    sltu a1, a0, a3
-; RV32ZICOND-NEXT:    add a1, a2, a1
+; RV32ZICOND-NEXT:    sltu a2, a0, a3
+; RV32ZICOND-NEXT:    add a1, a1, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: add2:
@@ -330,8 +330,8 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a4, a0, a4
-; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    add a2, a2, a4
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    sltu a1, a0, a1
 ; RV32I-NEXT:    add a1, a2, a1
@@ -347,8 +347,8 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: add3:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    add a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    add a0, a1, a0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a1
 ; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
@@ -369,8 +369,8 @@ define i64 @add3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: add3:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
-; RV32ZICOND-NEXT:    add a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.nez a0, a3, a0
+; RV32ZICOND-NEXT:    add a2, a2, a4
 ; RV32ZICOND-NEXT:    add a0, a1, a0
 ; RV32ZICOND-NEXT:    sltu a1, a0, a1
 ; RV32ZICOND-NEXT:    add a1, a2, a1
@@ -391,11 +391,11 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a2, a0, a2
-; RV32I-NEXT:    add a2, a4, a2
 ; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    add a1, a4, a2
 ; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    sltu a1, a0, a3
-; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: add4:
@@ -408,11 +408,11 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: add4:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    add a2, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a1, a0
+; RV32XVENTANACONDOPS-NEXT:    add a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    add a0, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    sltu a1, a0, a3
-; RV32XVENTANACONDOPS-NEXT:    add a1, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    sltu a2, a0, a3
+; RV32XVENTANACONDOPS-NEXT:    add a1, a1, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: add4:
@@ -430,11 +430,11 @@ define i64 @add4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: add4:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
-; RV32ZICOND-NEXT:    add a2, a4, a2
 ; RV32ZICOND-NEXT:    czero.nez a0, a1, a0
+; RV32ZICOND-NEXT:    add a1, a4, a2
 ; RV32ZICOND-NEXT:    add a0, a3, a0
-; RV32ZICOND-NEXT:    sltu a1, a0, a3
-; RV32ZICOND-NEXT:    add a1, a2, a1
+; RV32ZICOND-NEXT:    sltu a2, a0, a3
+; RV32ZICOND-NEXT:    add a1, a1, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: add4:
@@ -452,10 +452,10 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    sltu a5, a1, a3
 ; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    sltu a4, a1, a3
 ; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    sub a2, a2, a5
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a0, a1, a3
 ; RV32I-NEXT:    mv a1, a2
 ; RV32I-NEXT:    ret
@@ -470,10 +470,10 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: sub1:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    sltu a5, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a4, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a5
+; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    sub a0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    mv a1, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
@@ -493,10 +493,10 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: sub1:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
-; RV32ZICOND-NEXT:    sltu a5, a1, a3
 ; RV32ZICOND-NEXT:    czero.eqz a0, a4, a0
+; RV32ZICOND-NEXT:    sltu a4, a1, a3
 ; RV32ZICOND-NEXT:    sub a2, a2, a0
-; RV32ZICOND-NEXT:    sub a2, a2, a5
+; RV32ZICOND-NEXT:    sub a2, a2, a4
 ; RV32ZICOND-NEXT:    sub a0, a1, a3
 ; RV32ZICOND-NEXT:    mv a1, a2
 ; RV32ZICOND-NEXT:    ret
@@ -516,10 +516,10 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    sltu a5, a1, a3
 ; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    sltu a4, a1, a3
 ; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    sub a2, a2, a5
+; RV32I-NEXT:    sub a2, a2, a4
 ; RV32I-NEXT:    sub a0, a1, a3
 ; RV32I-NEXT:    mv a1, a2
 ; RV32I-NEXT:    ret
@@ -534,10 +534,10 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: sub2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    sltu a5, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    sltu a4, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a5
+; RV32XVENTANACONDOPS-NEXT:    sub a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    sub a0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    mv a1, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
@@ -557,10 +557,10 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: sub2:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
-; RV32ZICOND-NEXT:    sltu a5, a1, a3
 ; RV32ZICOND-NEXT:    czero.nez a0, a4, a0
+; RV32ZICOND-NEXT:    sltu a4, a1, a3
 ; RV32ZICOND-NEXT:    sub a2, a2, a0
-; RV32ZICOND-NEXT:    sub a2, a2, a5
+; RV32ZICOND-NEXT:    sub a2, a2, a4
 ; RV32ZICOND-NEXT:    sub a0, a1, a3
 ; RV32ZICOND-NEXT:    mv a1, a2
 ; RV32ZICOND-NEXT:    ret
@@ -578,11 +578,11 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: or1:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    neg a5, a0
-; RV32I-NEXT:    and a0, a5, a3
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    and a1, a5, a4
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    and a4, a0, a4
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    or a1, a2, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: or1:
@@ -595,10 +595,9 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: or1:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: or1:
@@ -616,10 +615,9 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: or1:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
-; RV32ZICOND-NEXT:    or a3, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
-; RV32ZICOND-NEXT:    or a1, a2, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a4, a4, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or1:
@@ -635,11 +633,11 @@ define i64 @or1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: or2:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    neg a5, a0
-; RV32I-NEXT:    and a0, a5, a1
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    and a1, a5, a2
-; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a0, a3, a1
+; RV32I-NEXT:    or a1, a4, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: or2:
@@ -652,10 +650,9 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: or2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: or2:
@@ -673,10 +670,9 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: or2:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a2, a0
-; RV32ZICOND-NEXT:    or a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a3, a1
+; RV32ZICOND-NEXT:    or a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or2:
@@ -692,11 +688,11 @@ define i64 @or2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: or3:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a0, -1
-; RV32I-NEXT:    and a0, a5, a3
-; RV32I-NEXT:    or a0, a1, a0
-; RV32I-NEXT:    and a1, a5, a4
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    and a4, a0, a4
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    or a1, a2, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: or3:
@@ -709,10 +705,9 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: or3:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: or3:
@@ -730,10 +725,9 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: or3:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
-; RV32ZICOND-NEXT:    or a3, a1, a3
-; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
-; RV32ZICOND-NEXT:    or a1, a2, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or3:
@@ -749,11 +743,11 @@ define i64 @or3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: or4:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a0, -1
-; RV32I-NEXT:    and a0, a5, a1
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    and a1, a5, a2
-; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a0, a3, a1
+; RV32I-NEXT:    or a1, a4, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: or4:
@@ -766,10 +760,9 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: or4:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: or4:
@@ -787,10 +780,9 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: or4:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a3, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a2, a0
-; RV32ZICOND-NEXT:    or a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a3, a1
+; RV32ZICOND-NEXT:    or a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: or4:
@@ -806,11 +798,11 @@ define i64 @or4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: xor1:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    neg a5, a0
-; RV32I-NEXT:    and a0, a5, a3
-; RV32I-NEXT:    xor a0, a1, a0
-; RV32I-NEXT:    and a1, a5, a4
-; RV32I-NEXT:    xor a1, a2, a1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    and a4, a0, a4
+; RV32I-NEXT:    xor a0, a1, a3
+; RV32I-NEXT:    xor a1, a2, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: xor1:
@@ -823,10 +815,9 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: xor1:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: xor1:
@@ -844,10 +835,9 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: xor1:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
-; RV32ZICOND-NEXT:    xor a3, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
-; RV32ZICOND-NEXT:    xor a1, a2, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a4, a4, a0
+; RV32ZICOND-NEXT:    xor a0, a1, a3
+; RV32ZICOND-NEXT:    xor a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor1:
@@ -863,11 +853,11 @@ define i64 @xor1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: xor2:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    neg a5, a0
-; RV32I-NEXT:    and a0, a5, a1
-; RV32I-NEXT:    xor a0, a3, a0
-; RV32I-NEXT:    and a1, a5, a2
-; RV32I-NEXT:    xor a1, a4, a1
+; RV32I-NEXT:    neg a0, a0
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    xor a0, a3, a1
+; RV32I-NEXT:    xor a1, a4, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: xor2:
@@ -880,10 +870,9 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: xor2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: xor2:
@@ -901,10 +890,9 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: xor2:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a0
-; RV32ZICOND-NEXT:    xor a3, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a2, a0
-; RV32ZICOND-NEXT:    xor a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    xor a0, a3, a1
+; RV32ZICOND-NEXT:    xor a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor2:
@@ -920,11 +908,11 @@ define i64 @xor2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: xor3:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a0, -1
-; RV32I-NEXT:    and a0, a5, a3
-; RV32I-NEXT:    xor a0, a1, a0
-; RV32I-NEXT:    and a1, a5, a4
-; RV32I-NEXT:    xor a1, a2, a1
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    and a3, a0, a3
+; RV32I-NEXT:    and a4, a0, a4
+; RV32I-NEXT:    xor a0, a1, a3
+; RV32I-NEXT:    xor a1, a2, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: xor3:
@@ -937,10 +925,9 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: xor3:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: xor3:
@@ -958,10 +945,9 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: xor3:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
-; RV32ZICOND-NEXT:    xor a3, a1, a3
-; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
-; RV32ZICOND-NEXT:    xor a1, a2, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
+; RV32ZICOND-NEXT:    xor a0, a1, a3
+; RV32ZICOND-NEXT:    xor a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor3:
@@ -977,11 +963,11 @@ define i64 @xor3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: xor4:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi a5, a0, -1
-; RV32I-NEXT:    and a0, a5, a1
-; RV32I-NEXT:    xor a0, a3, a0
-; RV32I-NEXT:    and a1, a5, a2
-; RV32I-NEXT:    xor a1, a4, a1
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    and a1, a0, a1
+; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    xor a0, a3, a1
+; RV32I-NEXT:    xor a1, a4, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: xor4:
@@ -994,10 +980,9 @@ define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: xor4:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    xor a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    xor a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: xor4:
@@ -1015,10 +1000,9 @@ define i64 @xor4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: xor4:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a0
-; RV32ZICOND-NEXT:    xor a3, a3, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a2, a0
-; RV32ZICOND-NEXT:    xor a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    xor a0, a3, a1
+; RV32ZICOND-NEXT:    xor a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: xor4:
@@ -1057,10 +1041,9 @@ define i64 @and1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-NEXT:    and a4, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    and a3, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: and1:
@@ -1082,10 +1065,9 @@ define i64 @and1(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-NEXT:    and a4, a2, a4
 ; RV32ZICOND-NEXT:    and a3, a1, a3
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a3, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a2, a0
-; RV32ZICOND-NEXT:    or a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a3, a1
+; RV32ZICOND-NEXT:    or a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: and1:
@@ -1122,13 +1104,12 @@ define i64 @and2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ;
 ; RV32XVENTANACONDOPS-LABEL: and2:
 ; RV32XVENTANACONDOPS:       # %bb.0:
-; RV32XVENTANACONDOPS-NEXT:    and a5, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    and a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    and a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    or a2, a1, a2
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: and2:
@@ -1147,13 +1128,12 @@ define i64 @and2(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ;
 ; RV32ZICOND-LABEL: and2:
 ; RV32ZICOND:       # %bb.0:
-; RV32ZICOND-NEXT:    and a5, a2, a4
+; RV32ZICOND-NEXT:    and a2, a2, a4
 ; RV32ZICOND-NEXT:    and a1, a1, a3
-; RV32ZICOND-NEXT:    czero.nez a2, a3, a0
-; RV32ZICOND-NEXT:    or a2, a1, a2
-; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
-; RV32ZICOND-NEXT:    or a1, a5, a1
-; RV32ZICOND-NEXT:    mv a0, a2
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: and2:
@@ -1193,10 +1173,9 @@ define i64 @and3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-NEXT:    and a4, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    and a3, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a3, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a2
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: and3:
@@ -1218,10 +1197,9 @@ define i64 @and3(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-NEXT:    and a4, a2, a4
 ; RV32ZICOND-NEXT:    and a3, a1, a3
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a2, a0
-; RV32ZICOND-NEXT:    or a1, a4, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a3, a1
+; RV32ZICOND-NEXT:    or a1, a4, a2
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: and3:
@@ -1258,13 +1236,12 @@ define i64 @and4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ;
 ; RV32XVENTANACONDOPS-LABEL: and4:
 ; RV32XVENTANACONDOPS:       # %bb.0:
-; RV32XVENTANACONDOPS-NEXT:    and a5, a2, a4
+; RV32XVENTANACONDOPS-NEXT:    and a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    and a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a3, a0
-; RV32XVENTANACONDOPS-NEXT:    or a2, a1, a2
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: and4:
@@ -1283,13 +1260,12 @@ define i64 @and4(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ;
 ; RV32ZICOND-LABEL: and4:
 ; RV32ZICOND:       # %bb.0:
-; RV32ZICOND-NEXT:    and a5, a2, a4
+; RV32ZICOND-NEXT:    and a2, a2, a4
 ; RV32ZICOND-NEXT:    and a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a0
-; RV32ZICOND-NEXT:    or a2, a1, a2
-; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
-; RV32ZICOND-NEXT:    or a1, a5, a1
-; RV32ZICOND-NEXT:    mv a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a4, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: and4:
@@ -1328,11 +1304,10 @@ define i64 @basic(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, a0
-; RV32XVENTANACONDOPS-NEXT:    or a3, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    mv a0, a3
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a1, a3
+; RV32XVENTANACONDOPS-NEXT:    or a1, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: basic:
@@ -1352,11 +1327,10 @@ define i64 @basic(i1 zeroext %rc, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a0
-; RV32ZICOND-NEXT:    or a3, a1, a3
-; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
-; RV32ZICOND-NEXT:    czero.eqz a0, a2, a0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    mv a0, a3
+; RV32ZICOND-NEXT:    czero.nez a4, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    or a0, a1, a3
+; RV32ZICOND-NEXT:    or a1, a2, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: basic:
@@ -1374,14 +1348,14 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    or a2, a0, a1
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    beqz a2, .LBB23_2
+; RV32I-NEXT:    beqz a1, .LBB23_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a6
-; RV32I-NEXT:    mv a1, a7
+; RV32I-NEXT:    mv a5, a7
 ; RV32I-NEXT:  .LBB23_2:
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: seteq:
@@ -1397,13 +1371,13 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq:
@@ -1425,13 +1399,13 @@ define i64 @seteq(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor a1, a1, a3
 ; RV32ZICOND-NEXT:    xor a0, a0, a2
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq:
@@ -1451,14 +1425,14 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    or a2, a0, a1
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    bnez a2, .LBB24_2
+; RV32I-NEXT:    bnez a1, .LBB24_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a6
-; RV32I-NEXT:    mv a1, a7
+; RV32I-NEXT:    mv a5, a7
 ; RV32I-NEXT:  .LBB24_2:
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: setne:
@@ -1474,13 +1448,13 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor a1, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    xor a0, a0, a2
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setne:
@@ -1502,13 +1476,13 @@ define i64 @setne(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor a1, a1, a3
 ; RV32ZICOND-NEXT:    xor a0, a0, a2
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setne:
@@ -1555,16 +1529,16 @@ define i64 @setgt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    slt a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setgt:
@@ -1586,16 +1560,16 @@ define i64 @setgt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    slt a1, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setgt:
@@ -1642,16 +1616,16 @@ define i64 @setge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    slt a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setge:
@@ -1673,16 +1647,16 @@ define i64 @setge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    slt a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setge:
@@ -1729,16 +1703,16 @@ define i64 @setlt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    slt a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setlt:
@@ -1760,16 +1734,16 @@ define i64 @setlt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    slt a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setlt:
@@ -1816,16 +1790,16 @@ define i64 @setle(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    slt a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setle:
@@ -1847,16 +1821,16 @@ define i64 @setle(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    slt a1, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setle:
@@ -1903,16 +1877,16 @@ define i64 @setugt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setugt:
@@ -1934,16 +1908,16 @@ define i64 @setugt(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    sltu a1, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setugt:
@@ -1990,16 +1964,16 @@ define i64 @setuge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setuge:
@@ -2021,16 +1995,16 @@ define i64 @setuge(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    sltu a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setuge:
@@ -2077,16 +2051,16 @@ define i64 @setult(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a1, a3
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setult:
@@ -2108,16 +2082,16 @@ define i64 @setult(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    sltu a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a6, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a7, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a6, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a7, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setult:
@@ -2164,16 +2138,16 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xor t0, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    sltu a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    sltu a0, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a1, t0
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, t0
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a6, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a7, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a6, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a7, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a4, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setule:
@@ -2195,16 +2169,16 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xor t0, a1, a3
 ; RV32ZICOND-NEXT:    sltu a1, a3, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, t0
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, t0
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a6, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a4, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a7, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a5, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a6, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a7, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a4, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setule:
@@ -2222,14 +2196,14 @@ define i64 @setule(i64 %a, i64 %b, i64 %rs1, i64 %rs2) {
 define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: seteq_zero:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    or a6, a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a6, .LBB33_2
+; RV32I-NEXT:    beqz a1, .LBB33_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB33_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: seteq_zero:
@@ -2243,13 +2217,13 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ;
 ; RV32XVENTANACONDOPS-LABEL: seteq_zero:
 ; RV32XVENTANACONDOPS:       # %bb.0:
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq_zero:
@@ -2267,13 +2241,13 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ;
 ; RV32ZICOND-LABEL: seteq_zero:
 ; RV32ZICOND:       # %bb.0:
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a4, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a5, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq_zero:
@@ -2290,14 +2264,14 @@ define i64 @seteq_zero(i64 %a, i64 %rs1, i64 %rs2) {
 define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: setne_zero:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    or a6, a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a6, .LBB34_2
+; RV32I-NEXT:    bnez a1, .LBB34_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB34_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: setne_zero:
@@ -2311,13 +2285,13 @@ define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ;
 ; RV32XVENTANACONDOPS-LABEL: setne_zero:
 ; RV32XVENTANACONDOPS:       # %bb.0:
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setne_zero:
@@ -2335,13 +2309,13 @@ define i64 @setne_zero(i64 %a, i64 %rs1, i64 %rs2) {
 ;
 ; RV32ZICOND-LABEL: setne_zero:
 ; RV32ZICOND:       # %bb.0:
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a4, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a5, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setne_zero:
@@ -2359,14 +2333,14 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: seteq_constant:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xori a0, a0, 123
-; RV32I-NEXT:    or a6, a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a6, .LBB35_2
+; RV32I-NEXT:    beqz a1, .LBB35_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB35_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: seteq_constant:
@@ -2382,13 +2356,13 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: seteq_constant:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, 123
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq_constant:
@@ -2409,13 +2383,13 @@ define i64 @seteq_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: seteq_constant:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xori a0, a0, 123
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a4, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a5, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq_constant:
@@ -2434,14 +2408,14 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: setne_constant:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    xori a0, a0, 456
-; RV32I-NEXT:    or a6, a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a6, .LBB36_2
+; RV32I-NEXT:    bnez a1, .LBB36_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB36_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: setne_constant:
@@ -2457,13 +2431,13 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: setne_constant:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, 456
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setne_constant:
@@ -2484,13 +2458,13 @@ define i64 @setne_constant(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: setne_constant:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    xori a0, a0, 456
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a4, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a5, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setne_constant:
@@ -2509,14 +2483,14 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I-LABEL: seteq_2048:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    binvi a0, a0, 11
-; RV32I-NEXT:    or a6, a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a6, .LBB37_2
+; RV32I-NEXT:    beqz a1, .LBB37_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB37_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: seteq_2048:
@@ -2532,13 +2506,13 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-LABEL: seteq_2048:
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    binvi a0, a0, 11
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq_2048:
@@ -2559,13 +2533,13 @@ define i64 @seteq_2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-LABEL: seteq_2048:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    binvi a0, a0, 11
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a4, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a5, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq_2048:
@@ -2585,14 +2559,14 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a1, a1
 ; RV32I-NEXT:    xori a0, a0, -2048
-; RV32I-NEXT:    or a6, a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    beqz a6, .LBB38_2
+; RV32I-NEXT:    beqz a1, .LBB38_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB38_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: seteq_neg2048:
@@ -2609,13 +2583,13 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    not a1, a1
 ; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: seteq_neg2048:
@@ -2637,13 +2611,13 @@ define i64 @seteq_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    not a1, a1
 ; RV32ZICOND-NEXT:    xori a0, a0, -2048
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.eqz a0, a4, a1
-; RV32ZICOND-NEXT:    czero.nez a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.eqz a2, a5, a1
-; RV32ZICOND-NEXT:    czero.nez a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a4, a0
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a0
+; RV32ZICOND-NEXT:    czero.eqz a4, a5, a0
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: seteq_neg2048:
@@ -2663,14 +2637,14 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a1, a1
 ; RV32I-NEXT:    xori a0, a0, -2048
-; RV32I-NEXT:    or a6, a0, a1
-; RV32I-NEXT:    mv a1, a3
+; RV32I-NEXT:    or a1, a0, a1
 ; RV32I-NEXT:    mv a0, a2
-; RV32I-NEXT:    bnez a6, .LBB39_2
+; RV32I-NEXT:    bnez a1, .LBB39_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a4
-; RV32I-NEXT:    mv a1, a5
+; RV32I-NEXT:    mv a3, a5
 ; RV32I-NEXT:  .LBB39_2:
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: setne_neg2048:
@@ -2687,13 +2661,13 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS:       # %bb.0:
 ; RV32XVENTANACONDOPS-NEXT:    not a1, a1
 ; RV32XVENTANACONDOPS-NEXT:    xori a0, a0, -2048
-; RV32XVENTANACONDOPS-NEXT:    or a1, a0, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a4, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a1
-; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a0
-; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a2, a5, a1
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a1, a3, a1
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a1
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a4, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a4, a5, a0
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a0
+; RV32XVENTANACONDOPS-NEXT:    or a0, a2, a1
+; RV32XVENTANACONDOPS-NEXT:    or a1, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setne_neg2048:
@@ -2715,13 +2689,13 @@ define i64 @setne_neg2048(i64 %a, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    not a1, a1
 ; RV32ZICOND-NEXT:    xori a0, a0, -2048
-; RV32ZICOND-NEXT:    or a1, a0, a1
-; RV32ZICOND-NEXT:    czero.nez a0, a4, a1
-; RV32ZICOND-NEXT:    czero.eqz a2, a2, a1
-; RV32ZICOND-NEXT:    or a0, a2, a0
-; RV32ZICOND-NEXT:    czero.nez a2, a5, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a3, a1
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    czero.nez a1, a4, a0
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a0
+; RV32ZICOND-NEXT:    czero.nez a4, a5, a0
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a0
+; RV32ZICOND-NEXT:    or a0, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a4
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setne_neg2048:
@@ -4097,10 +4071,10 @@ define i64 @setune_64(float %a, float %b, i64 %rs1, i64 %rs2) {
 ; RV32XVENTANACONDOPS-NEXT:    feq.s a4, fa0, fa1
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a2, a4
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a0, a0, a4
-; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a2
-; RV32XVENTANACONDOPS-NEXT:    vt.maskc a2, a3, a4
+; RV32XVENTANACONDOPS-NEXT:    vt.maskc a3, a3, a4
 ; RV32XVENTANACONDOPS-NEXT:    vt.maskcn a1, a1, a4
-; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a2
+; RV32XVENTANACONDOPS-NEXT:    or a0, a0, a2
+; RV32XVENTANACONDOPS-NEXT:    or a1, a1, a3
 ; RV32XVENTANACONDOPS-NEXT:    ret
 ;
 ; RV64XVENTANACONDOPS-LABEL: setune_64:
@@ -4122,10 +4096,10 @@ define i64 @setune_64(float %a, float %b, i64 %rs1, i64 %rs2) {
 ; RV32ZICOND-NEXT:    feq.s a4, fa0, fa1
 ; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a2, a3, a4
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a4
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: setune_64:

diff  --git a/llvm/test/CodeGen/RISCV/copysign-casts.ll b/llvm/test/CodeGen/RISCV/copysign-casts.ll
index 3b376626a783d2..53de36f1699a93 100644
--- a/llvm/test/CodeGen/RISCV/copysign-casts.ll
+++ b/llvm/test/CodeGen/RISCV/copysign-casts.ll
@@ -45,8 +45,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV32I-LABEL: fold_promote_d_s:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a3, 524288
-; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -54,9 +54,9 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV64I-LABEL: fold_promote_d_s:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 524288
+; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -65,8 +65,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    fmv.x.w a2, fa0
 ; RV32IF-NEXT:    lui a3, 524288
-; RV32IF-NEXT:    and a2, a2, a3
 ; RV32IF-NEXT:    slli a1, a1, 1
+; RV32IF-NEXT:    and a2, a2, a3
 ; RV32IF-NEXT:    srli a1, a1, 1
 ; RV32IF-NEXT:    or a1, a1, a2
 ; RV32IF-NEXT:    ret
@@ -87,8 +87,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV32IFZFH:       # %bb.0:
 ; RV32IFZFH-NEXT:    fmv.x.w a2, fa0
 ; RV32IFZFH-NEXT:    lui a3, 524288
-; RV32IFZFH-NEXT:    and a2, a2, a3
 ; RV32IFZFH-NEXT:    slli a1, a1, 1
+; RV32IFZFH-NEXT:    and a2, a2, a3
 ; RV32IFZFH-NEXT:    srli a1, a1, 1
 ; RV32IFZFH-NEXT:    or a1, a1, a2
 ; RV32IFZFH-NEXT:    ret
@@ -109,8 +109,8 @@ define double @fold_promote_d_s(double %a, float %b) nounwind {
 ; RV32IFZFHMIN:       # %bb.0:
 ; RV32IFZFHMIN-NEXT:    fmv.x.w a2, fa0
 ; RV32IFZFHMIN-NEXT:    lui a3, 524288
-; RV32IFZFHMIN-NEXT:    and a2, a2, a3
 ; RV32IFZFHMIN-NEXT:    slli a1, a1, 1
+; RV32IFZFHMIN-NEXT:    and a2, a2, a3
 ; RV32IFZFHMIN-NEXT:    srli a1, a1, 1
 ; RV32IFZFHMIN-NEXT:    or a1, a1, a2
 ; RV32IFZFHMIN-NEXT:    ret
@@ -147,9 +147,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV32I-LABEL: fold_promote_d_h:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a3, 8
+; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    slli a2, a2, 16
-; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -157,9 +157,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV64I-LABEL: fold_promote_d_h:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 8
+; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a1, a1, 48
-; RV64I-NEXT:    slli a0, a0, 1
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -168,9 +168,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    fmv.x.w a2, fa0
 ; RV32IF-NEXT:    lui a3, 8
+; RV32IF-NEXT:    slli a1, a1, 1
 ; RV32IF-NEXT:    and a2, a2, a3
 ; RV32IF-NEXT:    slli a2, a2, 16
-; RV32IF-NEXT:    slli a1, a1, 1
 ; RV32IF-NEXT:    srli a1, a1, 1
 ; RV32IF-NEXT:    or a1, a1, a2
 ; RV32IF-NEXT:    ret
@@ -209,9 +209,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV32IFZFH:       # %bb.0:
 ; RV32IFZFH-NEXT:    fmv.x.h a2, fa0
 ; RV32IFZFH-NEXT:    lui a3, 8
+; RV32IFZFH-NEXT:    slli a1, a1, 1
 ; RV32IFZFH-NEXT:    and a2, a2, a3
 ; RV32IFZFH-NEXT:    slli a2, a2, 16
-; RV32IFZFH-NEXT:    slli a1, a1, 1
 ; RV32IFZFH-NEXT:    srli a1, a1, 1
 ; RV32IFZFH-NEXT:    or a1, a1, a2
 ; RV32IFZFH-NEXT:    ret
@@ -232,9 +232,9 @@ define double @fold_promote_d_h(double %a, half %b) nounwind {
 ; RV32IFZFHMIN:       # %bb.0:
 ; RV32IFZFHMIN-NEXT:    fmv.x.h a2, fa0
 ; RV32IFZFHMIN-NEXT:    lui a3, 8
+; RV32IFZFHMIN-NEXT:    slli a1, a1, 1
 ; RV32IFZFHMIN-NEXT:    and a2, a2, a3
 ; RV32IFZFHMIN-NEXT:    slli a2, a2, 16
-; RV32IFZFHMIN-NEXT:    slli a1, a1, 1
 ; RV32IFZFHMIN-NEXT:    srli a1, a1, 1
 ; RV32IFZFHMIN-NEXT:    or a1, a1, a2
 ; RV32IFZFHMIN-NEXT:    ret
@@ -292,9 +292,9 @@ define float @fold_promote_f_h(float %a, half %b) nounwind {
 ; RV32I-LABEL: fold_promote_f_h:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 8
+; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a1, a1, 16
-; RV32I-NEXT:    slli a0, a0, 1
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -302,9 +302,9 @@ define float @fold_promote_f_h(float %a, half %b) nounwind {
 ; RV64I-LABEL: fold_promote_f_h:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 8
+; RV64I-NEXT:    slli a0, a0, 33
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slliw a1, a1, 16
-; RV64I-NEXT:    slli a0, a0, 33
 ; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -423,8 +423,8 @@ define float @fold_demote_s_d(float %a, double %b) nounwind {
 ; RV32I-LABEL: fold_demote_s_d:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a1, 524288
-; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -432,8 +432,8 @@ define float @fold_demote_s_d(float %a, double %b) nounwind {
 ; RV64I-LABEL: fold_demote_s_d:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    srli a1, a1, 63
+; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a1
@@ -515,9 +515,9 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32I-LABEL: fold_demote_h_s:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
+; RV32I-NEXT:    slli a0, a0, 17
 ; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a1, a1, 16
-; RV32I-NEXT:    slli a0, a0, 17
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -525,8 +525,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV64I-LABEL: fold_demote_h_s:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a1, a1, 31
-; RV64I-NEXT:    slli a1, a1, 15
 ; RV64I-NEXT:    slli a0, a0, 49
+; RV64I-NEXT:    slli a1, a1, 15
 ; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -537,8 +537,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IF-NEXT:    fmv.x.w a1, fa1
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    and a1, a1, a2
-; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    slli a0, a0, 17
+; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    srli a0, a0, 17
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    lui a1, 1048560
@@ -552,8 +552,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IFD-NEXT:    fmv.x.w a1, fa1
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    and a1, a1, a2
-; RV32IFD-NEXT:    srli a1, a1, 16
 ; RV32IFD-NEXT:    slli a0, a0, 17
+; RV32IFD-NEXT:    srli a1, a1, 16
 ; RV32IFD-NEXT:    srli a0, a0, 17
 ; RV32IFD-NEXT:    or a0, a0, a1
 ; RV32IFD-NEXT:    lui a1, 1048560
@@ -567,8 +567,8 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV64IFD-NEXT:    fmv.x.w a1, fa1
 ; RV64IFD-NEXT:    lui a2, 524288
 ; RV64IFD-NEXT:    and a1, a1, a2
-; RV64IFD-NEXT:    srli a1, a1, 16
 ; RV64IFD-NEXT:    slli a0, a0, 49
+; RV64IFD-NEXT:    srli a1, a1, 16
 ; RV64IFD-NEXT:    srli a0, a0, 49
 ; RV64IFD-NEXT:    or a0, a0, a1
 ; RV64IFD-NEXT:    lui a1, 1048560
@@ -597,10 +597,10 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IFZFHMIN-LABEL: fold_demote_h_s:
 ; RV32IFZFHMIN:       # %bb.0:
 ; RV32IFZFHMIN-NEXT:    fmv.x.w a0, fa1
-; RV32IFZFHMIN-NEXT:    srli a0, a0, 31
-; RV32IFZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFZFHMIN-NEXT:    fmv.x.h a1, fa0
+; RV32IFZFHMIN-NEXT:    srli a0, a0, 31
 ; RV32IFZFHMIN-NEXT:    slli a1, a1, 17
+; RV32IFZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFZFHMIN-NEXT:    srli a1, a1, 17
 ; RV32IFZFHMIN-NEXT:    or a0, a1, a0
 ; RV32IFZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -609,10 +609,10 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IFDZFHMIN-LABEL: fold_demote_h_s:
 ; RV32IFDZFHMIN:       # %bb.0:
 ; RV32IFDZFHMIN-NEXT:    fmv.x.w a0, fa1
-; RV32IFDZFHMIN-NEXT:    srli a0, a0, 31
-; RV32IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFDZFHMIN-NEXT:    fmv.x.h a1, fa0
+; RV32IFDZFHMIN-NEXT:    srli a0, a0, 31
 ; RV32IFDZFHMIN-NEXT:    slli a1, a1, 17
+; RV32IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFDZFHMIN-NEXT:    srli a1, a1, 17
 ; RV32IFDZFHMIN-NEXT:    or a0, a1, a0
 ; RV32IFDZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -621,10 +621,10 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV64IFDZFHMIN-LABEL: fold_demote_h_s:
 ; RV64IFDZFHMIN:       # %bb.0:
 ; RV64IFDZFHMIN-NEXT:    fmv.x.w a0, fa1
-; RV64IFDZFHMIN-NEXT:    srli a0, a0, 31
-; RV64IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV64IFDZFHMIN-NEXT:    fmv.x.h a1, fa0
+; RV64IFDZFHMIN-NEXT:    srli a0, a0, 31
 ; RV64IFDZFHMIN-NEXT:    slli a1, a1, 49
+; RV64IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV64IFDZFHMIN-NEXT:    srli a1, a1, 49
 ; RV64IFDZFHMIN-NEXT:    or a0, a1, a0
 ; RV64IFDZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -635,11 +635,11 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV32IZDINX-NEXT:    # kill: def $x11_w killed $x11_w def $x11
 ; RV32IZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV32IZDINX-NEXT:    lui a2, 524288
-; RV32IZDINX-NEXT:    and a1, a1, a2
-; RV32IZDINX-NEXT:    srli a1, a1, 16
 ; RV32IZDINX-NEXT:    slli a0, a0, 17
-; RV32IZDINX-NEXT:    srli a0, a0, 17
+; RV32IZDINX-NEXT:    and a1, a1, a2
 ; RV32IZDINX-NEXT:    lui a2, 1048560
+; RV32IZDINX-NEXT:    srli a0, a0, 17
+; RV32IZDINX-NEXT:    srli a1, a1, 16
 ; RV32IZDINX-NEXT:    or a0, a0, a2
 ; RV32IZDINX-NEXT:    or a0, a0, a1
 ; RV32IZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10
@@ -650,11 +650,11 @@ define half @fold_demote_h_s(half %a, float %b) nounwind {
 ; RV64IZDINX-NEXT:    # kill: def $x11_w killed $x11_w def $x11
 ; RV64IZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV64IZDINX-NEXT:    lui a2, 524288
-; RV64IZDINX-NEXT:    and a1, a1, a2
-; RV64IZDINX-NEXT:    srli a1, a1, 16
 ; RV64IZDINX-NEXT:    slli a0, a0, 49
-; RV64IZDINX-NEXT:    srli a0, a0, 49
+; RV64IZDINX-NEXT:    and a1, a1, a2
 ; RV64IZDINX-NEXT:    lui a2, 1048560
+; RV64IZDINX-NEXT:    srli a0, a0, 49
+; RV64IZDINX-NEXT:    srli a1, a1, 16
 ; RV64IZDINX-NEXT:    or a0, a0, a2
 ; RV64IZDINX-NEXT:    or a0, a0, a1
 ; RV64IZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10
@@ -668,9 +668,9 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32I-LABEL: fold_demote_h_d:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    slli a0, a0, 17
 ; RV32I-NEXT:    and a1, a2, a1
 ; RV32I-NEXT:    srli a1, a1, 16
-; RV32I-NEXT:    slli a0, a0, 17
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -678,8 +678,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV64I-LABEL: fold_demote_h_d:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 49
-; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    srli a1, a1, 63
+; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    or a0, a0, a1
@@ -690,8 +690,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IF-NEXT:    fmv.x.w a0, fa0
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    and a1, a1, a2
-; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    slli a0, a0, 17
+; RV32IF-NEXT:    srli a1, a1, 16
 ; RV32IF-NEXT:    srli a0, a0, 17
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    lui a1, 1048560
@@ -707,10 +707,10 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IFD-NEXT:    fmv.x.w a1, fa0
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    and a0, a0, a2
-; RV32IFD-NEXT:    srli a0, a0, 16
+; RV32IFD-NEXT:    lui a2, 1048560
 ; RV32IFD-NEXT:    slli a1, a1, 17
 ; RV32IFD-NEXT:    srli a1, a1, 17
-; RV32IFD-NEXT:    lui a2, 1048560
+; RV32IFD-NEXT:    srli a0, a0, 16
 ; RV32IFD-NEXT:    or a1, a1, a2
 ; RV32IFD-NEXT:    or a0, a1, a0
 ; RV32IFD-NEXT:    fmv.w.x fa0, a0
@@ -721,12 +721,12 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV64IFD:       # %bb.0:
 ; RV64IFD-NEXT:    fmv.x.d a0, fa1
 ; RV64IFD-NEXT:    fmv.x.w a1, fa0
+; RV64IFD-NEXT:    lui a2, 1048560
 ; RV64IFD-NEXT:    slli a1, a1, 49
-; RV64IFD-NEXT:    srli a1, a1, 49
 ; RV64IFD-NEXT:    srli a0, a0, 63
+; RV64IFD-NEXT:    srli a1, a1, 49
 ; RV64IFD-NEXT:    slli a0, a0, 63
 ; RV64IFD-NEXT:    srli a0, a0, 48
-; RV64IFD-NEXT:    lui a2, 1048560
 ; RV64IFD-NEXT:    or a1, a1, a2
 ; RV64IFD-NEXT:    or a0, a1, a0
 ; RV64IFD-NEXT:    fmv.w.x fa0, a0
@@ -754,8 +754,8 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IFZFHMIN-LABEL: fold_demote_h_d:
 ; RV32IFZFHMIN:       # %bb.0:
 ; RV32IFZFHMIN-NEXT:    srli a1, a1, 31
-; RV32IFZFHMIN-NEXT:    slli a1, a1, 15
 ; RV32IFZFHMIN-NEXT:    fmv.x.h a0, fa0
+; RV32IFZFHMIN-NEXT:    slli a1, a1, 15
 ; RV32IFZFHMIN-NEXT:    slli a0, a0, 17
 ; RV32IFZFHMIN-NEXT:    srli a0, a0, 17
 ; RV32IFZFHMIN-NEXT:    or a0, a0, a1
@@ -767,10 +767,10 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IFDZFHMIN-NEXT:    addi sp, sp, -16
 ; RV32IFDZFHMIN-NEXT:    fsd fa1, 8(sp)
 ; RV32IFDZFHMIN-NEXT:    lw a0, 12(sp)
-; RV32IFDZFHMIN-NEXT:    srli a0, a0, 31
-; RV32IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFDZFHMIN-NEXT:    fmv.x.h a1, fa0
 ; RV32IFDZFHMIN-NEXT:    slli a1, a1, 17
+; RV32IFDZFHMIN-NEXT:    srli a0, a0, 31
+; RV32IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV32IFDZFHMIN-NEXT:    srli a1, a1, 17
 ; RV32IFDZFHMIN-NEXT:    or a0, a1, a0
 ; RV32IFDZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -780,10 +780,10 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV64IFDZFHMIN-LABEL: fold_demote_h_d:
 ; RV64IFDZFHMIN:       # %bb.0:
 ; RV64IFDZFHMIN-NEXT:    fmv.x.d a0, fa1
-; RV64IFDZFHMIN-NEXT:    srli a0, a0, 63
-; RV64IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV64IFDZFHMIN-NEXT:    fmv.x.h a1, fa0
+; RV64IFDZFHMIN-NEXT:    srli a0, a0, 63
 ; RV64IFDZFHMIN-NEXT:    slli a1, a1, 49
+; RV64IFDZFHMIN-NEXT:    slli a0, a0, 15
 ; RV64IFDZFHMIN-NEXT:    srli a1, a1, 49
 ; RV64IFDZFHMIN-NEXT:    or a0, a1, a0
 ; RV64IFDZFHMIN-NEXT:    fmv.h.x fa0, a0
@@ -793,11 +793,11 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV32IZDINX:       # %bb.0:
 ; RV32IZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV32IZDINX-NEXT:    lui a1, 524288
-; RV32IZDINX-NEXT:    and a1, a2, a1
-; RV32IZDINX-NEXT:    srli a1, a1, 16
 ; RV32IZDINX-NEXT:    slli a0, a0, 17
-; RV32IZDINX-NEXT:    srli a0, a0, 17
+; RV32IZDINX-NEXT:    and a1, a2, a1
 ; RV32IZDINX-NEXT:    lui a2, 1048560
+; RV32IZDINX-NEXT:    srli a0, a0, 17
+; RV32IZDINX-NEXT:    srli a1, a1, 16
 ; RV32IZDINX-NEXT:    or a0, a0, a2
 ; RV32IZDINX-NEXT:    or a0, a0, a1
 ; RV32IZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10
@@ -807,11 +807,11 @@ define half @fold_demote_h_d(half %a, double %b) nounwind {
 ; RV64IZDINX:       # %bb.0:
 ; RV64IZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
 ; RV64IZDINX-NEXT:    slli a0, a0, 49
-; RV64IZDINX-NEXT:    srli a0, a0, 49
 ; RV64IZDINX-NEXT:    srli a1, a1, 63
+; RV64IZDINX-NEXT:    lui a2, 1048560
+; RV64IZDINX-NEXT:    srli a0, a0, 49
 ; RV64IZDINX-NEXT:    slli a1, a1, 63
 ; RV64IZDINX-NEXT:    srli a1, a1, 48
-; RV64IZDINX-NEXT:    lui a2, 1048560
 ; RV64IZDINX-NEXT:    or a0, a0, a2
 ; RV64IZDINX-NEXT:    or a0, a0, a1
 ; RV64IZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10

diff  --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
index 2c691a2de4c4de..da97ac0d742379 100644
--- a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
@@ -131,18 +131,18 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:  # %bb.1: # %cond.false
 ; RV32_NOZBB-NEXT:    addi a1, a0, -1
 ; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
 ; RV32_NOZBB-NEXT:    lui a2, 5
-; RV32_NOZBB-NEXT:    addi a2, a2, 1365
-; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    and a0, a0, a1
+; RV32_NOZBB-NEXT:    addi a1, a2, 1365
+; RV32_NOZBB-NEXT:    srli a2, a0, 1
+; RV32_NOZBB-NEXT:    and a1, a2, a1
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -161,18 +161,18 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
 ; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
 ; RV64NOZBB-NEXT:    addi a1, a0, -1
 ; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
 ; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    and a0, a0, a1
+; RV64NOZBB-NEXT:    addiw a1, a2, 1365
+; RV64NOZBB-NEXT:    srli a2, a0, 1
+; RV64NOZBB-NEXT:    and a1, a2, a1
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -620,18 +620,18 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
 ; RV32_NOZBB:       # %bb.0:
 ; RV32_NOZBB-NEXT:    addi a1, a0, -1
 ; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
 ; RV32_NOZBB-NEXT:    lui a2, 5
-; RV32_NOZBB-NEXT:    addi a2, a2, 1365
-; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    and a0, a0, a1
+; RV32_NOZBB-NEXT:    addi a1, a2, 1365
+; RV32_NOZBB-NEXT:    srli a2, a0, 1
+; RV32_NOZBB-NEXT:    and a1, a2, a1
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -644,18 +644,18 @@ define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
 ; RV64NOZBB:       # %bb.0:
 ; RV64NOZBB-NEXT:    addi a1, a0, -1
 ; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    srli a1, a0, 1
 ; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    and a0, a0, a1
+; RV64NOZBB-NEXT:    addiw a1, a2, 1365
+; RV64NOZBB-NEXT:    srli a2, a0, 1
+; RV64NOZBB-NEXT:    and a1, a2, a1
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -1052,28 +1052,28 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    beqz a1, .LBB9_2
 ; RV32_NOZBB-NEXT:  # %bb.1: # %cond.false
 ; RV32_NOZBB-NEXT:    srli a1, a1, 17
+; RV32_NOZBB-NEXT:    lui a2, 5
 ; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 16
-; RV32_NOZBB-NEXT:    srli a1, a1, 18
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 16
-; RV32_NOZBB-NEXT:    srli a1, a1, 20
-; RV32_NOZBB-NEXT:    or a0, a0, a1
-; RV32_NOZBB-NEXT:    slli a1, a0, 16
-; RV32_NOZBB-NEXT:    srli a1, a1, 24
-; RV32_NOZBB-NEXT:    or a0, a0, a1
+; RV32_NOZBB-NEXT:    addi a1, a2, 1365
+; RV32_NOZBB-NEXT:    slli a2, a0, 16
+; RV32_NOZBB-NEXT:    srli a2, a2, 18
+; RV32_NOZBB-NEXT:    or a0, a0, a2
+; RV32_NOZBB-NEXT:    slli a2, a0, 16
+; RV32_NOZBB-NEXT:    srli a2, a2, 20
+; RV32_NOZBB-NEXT:    or a0, a0, a2
+; RV32_NOZBB-NEXT:    slli a2, a0, 16
+; RV32_NOZBB-NEXT:    srli a2, a2, 24
+; RV32_NOZBB-NEXT:    or a0, a0, a2
 ; RV32_NOZBB-NEXT:    not a0, a0
-; RV32_NOZBB-NEXT:    srli a1, a0, 1
-; RV32_NOZBB-NEXT:    lui a2, 5
-; RV32_NOZBB-NEXT:    addi a2, a2, 1365
-; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    srli a2, a0, 1
+; RV32_NOZBB-NEXT:    and a1, a2, a1
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -1091,28 +1091,28 @@ define i16 @test_ctlz_i16(i16 %a) nounwind {
 ; RV64NOZBB-NEXT:    beqz a1, .LBB9_2
 ; RV64NOZBB-NEXT:  # %bb.1: # %cond.false
 ; RV64NOZBB-NEXT:    srli a1, a1, 49
+; RV64NOZBB-NEXT:    lui a2, 5
 ; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 50
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 52
-; RV64NOZBB-NEXT:    or a0, a0, a1
-; RV64NOZBB-NEXT:    slli a1, a0, 48
-; RV64NOZBB-NEXT:    srli a1, a1, 56
-; RV64NOZBB-NEXT:    or a0, a0, a1
+; RV64NOZBB-NEXT:    addiw a1, a2, 1365
+; RV64NOZBB-NEXT:    slli a2, a0, 48
+; RV64NOZBB-NEXT:    srli a2, a2, 50
+; RV64NOZBB-NEXT:    or a0, a0, a2
+; RV64NOZBB-NEXT:    slli a2, a0, 48
+; RV64NOZBB-NEXT:    srli a2, a2, 52
+; RV64NOZBB-NEXT:    or a0, a0, a2
+; RV64NOZBB-NEXT:    slli a2, a0, 48
+; RV64NOZBB-NEXT:    srli a2, a2, 56
+; RV64NOZBB-NEXT:    or a0, a0, a2
 ; RV64NOZBB-NEXT:    not a0, a0
-; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
-; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    srli a2, a0, 1
+; RV64NOZBB-NEXT:    and a1, a2, a1
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -1161,31 +1161,31 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    beqz a0, .LBB10_2
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -1203,31 +1203,31 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    beqz a1, .LBB10_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -1244,33 +1244,33 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV32M-NEXT:    beqz a0, .LBB10_2
 ; RV32M-NEXT:  # %bb.1: # %cond.false
 ; RV32M-NEXT:    srli a1, a0, 1
+; RV32M-NEXT:    lui a2, 349525
 ; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 2
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 4
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 8
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 16
-; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    addi a1, a2, 1365
+; RV32M-NEXT:    srli a2, a0, 2
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 4
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 8
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 16
+; RV32M-NEXT:    or a0, a0, a2
 ; RV32M-NEXT:    not a0, a0
-; RV32M-NEXT:    srli a1, a0, 1
-; RV32M-NEXT:    lui a2, 349525
-; RV32M-NEXT:    addi a2, a2, 1365
-; RV32M-NEXT:    and a1, a1, a2
+; RV32M-NEXT:    srli a2, a0, 1
+; RV32M-NEXT:    and a1, a2, a1
+; RV32M-NEXT:    lui a2, 209715
+; RV32M-NEXT:    addi a2, a2, 819
 ; RV32M-NEXT:    sub a0, a0, a1
-; RV32M-NEXT:    lui a1, 209715
-; RV32M-NEXT:    addi a1, a1, 819
-; RV32M-NEXT:    and a2, a0, a1
+; RV32M-NEXT:    and a1, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    add a0, a2, a0
+; RV32M-NEXT:    and a0, a0, a2
+; RV32M-NEXT:    lui a2, 61681
+; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    lui a1, 61681
-; RV32M-NEXT:    addi a1, a1, -241
-; RV32M-NEXT:    and a0, a0, a1
 ; RV32M-NEXT:    lui a1, 4112
+; RV32M-NEXT:    addi a2, a2, -241
+; RV32M-NEXT:    and a0, a0, a2
 ; RV32M-NEXT:    addi a1, a1, 257
 ; RV32M-NEXT:    mul a0, a0, a1
 ; RV32M-NEXT:    srli a0, a0, 24
@@ -1285,33 +1285,33 @@ define i32 @test_ctlz_i32(i32 %a) nounwind {
 ; RV64M-NEXT:    beqz a1, .LBB10_2
 ; RV64M-NEXT:  # %bb.1: # %cond.false
 ; RV64M-NEXT:    srliw a1, a0, 1
+; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    addiw a1, a2, 1365
+; RV64M-NEXT:    srliw a2, a0, 2
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 4
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 8
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 16
+; RV64M-NEXT:    or a0, a0, a2
 ; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    srli a2, a0, 1
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    lui a2, 209715
+; RV64M-NEXT:    addiw a2, a2, 819
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 61681
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
-; RV64M-NEXT:    and a0, a0, a1
 ; RV64M-NEXT:    lui a1, 4112
+; RV64M-NEXT:    addi a2, a2, -241
+; RV64M-NEXT:    and a0, a0, a2
 ; RV64M-NEXT:    addi a1, a1, 257
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srliw a0, a0, 24
@@ -1349,11 +1349,11 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB11_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -1420,40 +1420,40 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB11_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -1469,13 +1469,13 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32M-LABEL: test_ctlz_i64:
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    lui a2, 349525
+; RV32M-NEXT:    lui a3, 209715
+; RV32M-NEXT:    lui a6, 61681
+; RV32M-NEXT:    lui a7, 4112
 ; RV32M-NEXT:    addi a5, a2, 1365
-; RV32M-NEXT:    lui a2, 209715
-; RV32M-NEXT:    addi a4, a2, 819
-; RV32M-NEXT:    lui a2, 61681
-; RV32M-NEXT:    addi a2, a2, -241
-; RV32M-NEXT:    lui a3, 4112
-; RV32M-NEXT:    addi a3, a3, 257
+; RV32M-NEXT:    addi a4, a3, 819
+; RV32M-NEXT:    addi a3, a6, -241
+; RV32M-NEXT:    addi a2, a7, 257
 ; RV32M-NEXT:    bnez a1, .LBB11_2
 ; RV32M-NEXT:  # %bb.1:
 ; RV32M-NEXT:    srli a1, a0, 1
@@ -1498,8 +1498,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    mul a0, a0, a3
+; RV32M-NEXT:    and a0, a0, a3
+; RV32M-NEXT:    mul a0, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    addi a0, a0, 32
 ; RV32M-NEXT:    li a1, 0
@@ -1525,8 +1525,8 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    mul a0, a0, a3
+; RV32M-NEXT:    and a0, a0, a3
+; RV32M-NEXT:    mul a0, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    li a1, 0
 ; RV32M-NEXT:    ret
@@ -1536,44 +1536,44 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
 ; RV64M-NEXT:    beqz a0, .LBB11_2
 ; RV64M-NEXT:  # %bb.1: # %cond.false
 ; RV64M-NEXT:    srli a1, a0, 1
+; RV64M-NEXT:    lui a2, 349525
+; RV64M-NEXT:    lui a3, 209715
+; RV64M-NEXT:    lui a4, 61681
 ; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 32
-; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    addiw a1, a2, 1365
+; RV64M-NEXT:    addiw a2, a3, 819
+; RV64M-NEXT:    addiw a3, a4, -241
+; RV64M-NEXT:    srli a4, a0, 2
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    slli a4, a1, 32
+; RV64M-NEXT:    add a1, a1, a4
+; RV64M-NEXT:    slli a4, a2, 32
+; RV64M-NEXT:    add a2, a2, a4
+; RV64M-NEXT:    slli a4, a3, 32
+; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    srli a4, a0, 4
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 8
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 16
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 32
+; RV64M-NEXT:    or a0, a0, a4
 ; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    slli a3, a2, 32
-; RV64M-NEXT:    add a2, a2, a3
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    srli a4, a0, 1
+; RV64M-NEXT:    and a1, a4, a1
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 4112
+; RV64M-NEXT:    addiw a2, a2, 257
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addiw a1, a1, -241
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addiw a1, a1, 257
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a1, a2, 32
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    add a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
@@ -1700,7 +1700,9 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 ; RV32_NOZBB-LABEL: test_ctlz_i16_zero_undef:
 ; RV32_NOZBB:       # %bb.0:
 ; RV32_NOZBB-NEXT:    slli a1, a0, 16
+; RV32_NOZBB-NEXT:    lui a2, 5
 ; RV32_NOZBB-NEXT:    srli a1, a1, 17
+; RV32_NOZBB-NEXT:    addi a2, a2, 1365
 ; RV32_NOZBB-NEXT:    or a0, a0, a1
 ; RV32_NOZBB-NEXT:    slli a1, a0, 16
 ; RV32_NOZBB-NEXT:    srli a1, a1, 18
@@ -1713,16 +1715,14 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    or a0, a0, a1
 ; RV32_NOZBB-NEXT:    not a0, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 1
-; RV32_NOZBB-NEXT:    lui a2, 5
-; RV32_NOZBB-NEXT:    addi a2, a2, 1365
 ; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -1734,7 +1734,9 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 ; RV64NOZBB-LABEL: test_ctlz_i16_zero_undef:
 ; RV64NOZBB:       # %bb.0:
 ; RV64NOZBB-NEXT:    slli a1, a0, 48
+; RV64NOZBB-NEXT:    lui a2, 5
 ; RV64NOZBB-NEXT:    srli a1, a1, 49
+; RV64NOZBB-NEXT:    addiw a2, a2, 1365
 ; RV64NOZBB-NEXT:    or a0, a0, a1
 ; RV64NOZBB-NEXT:    slli a1, a0, 48
 ; RV64NOZBB-NEXT:    srli a1, a1, 50
@@ -1747,16 +1749,14 @@ define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
 ; RV64NOZBB-NEXT:    or a0, a0, a1
 ; RV64NOZBB-NEXT:    not a0, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 1
-; RV64NOZBB-NEXT:    lui a2, 5
-; RV64NOZBB-NEXT:    addiw a2, a2, 1365
 ; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -1796,31 +1796,31 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i32_zero_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -1832,31 +1832,31 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV64I-LABEL: test_ctlz_i32_zero_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -1868,33 +1868,33 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV32M-LABEL: test_ctlz_i32_zero_undef:
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    srli a1, a0, 1
+; RV32M-NEXT:    lui a2, 349525
 ; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 2
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 4
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 8
-; RV32M-NEXT:    or a0, a0, a1
-; RV32M-NEXT:    srli a1, a0, 16
-; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    addi a1, a2, 1365
+; RV32M-NEXT:    srli a2, a0, 2
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 4
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 8
+; RV32M-NEXT:    or a0, a0, a2
+; RV32M-NEXT:    srli a2, a0, 16
+; RV32M-NEXT:    or a0, a0, a2
 ; RV32M-NEXT:    not a0, a0
-; RV32M-NEXT:    srli a1, a0, 1
-; RV32M-NEXT:    lui a2, 349525
-; RV32M-NEXT:    addi a2, a2, 1365
-; RV32M-NEXT:    and a1, a1, a2
+; RV32M-NEXT:    srli a2, a0, 1
+; RV32M-NEXT:    and a1, a2, a1
+; RV32M-NEXT:    lui a2, 209715
+; RV32M-NEXT:    addi a2, a2, 819
 ; RV32M-NEXT:    sub a0, a0, a1
-; RV32M-NEXT:    lui a1, 209715
-; RV32M-NEXT:    addi a1, a1, 819
-; RV32M-NEXT:    and a2, a0, a1
+; RV32M-NEXT:    and a1, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    add a0, a2, a0
+; RV32M-NEXT:    and a0, a0, a2
+; RV32M-NEXT:    lui a2, 61681
+; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    lui a1, 61681
-; RV32M-NEXT:    addi a1, a1, -241
-; RV32M-NEXT:    and a0, a0, a1
 ; RV32M-NEXT:    lui a1, 4112
+; RV32M-NEXT:    addi a2, a2, -241
+; RV32M-NEXT:    and a0, a0, a2
 ; RV32M-NEXT:    addi a1, a1, 257
 ; RV32M-NEXT:    mul a0, a0, a1
 ; RV32M-NEXT:    srli a0, a0, 24
@@ -1903,33 +1903,33 @@ define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
 ; RV64M-LABEL: test_ctlz_i32_zero_undef:
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    srliw a1, a0, 1
+; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srliw a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    addiw a1, a2, 1365
+; RV64M-NEXT:    srliw a2, a0, 2
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 4
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 8
+; RV64M-NEXT:    or a0, a0, a2
+; RV64M-NEXT:    srliw a2, a0, 16
+; RV64M-NEXT:    or a0, a0, a2
 ; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    srli a2, a0, 1
+; RV64M-NEXT:    and a1, a2, a1
+; RV64M-NEXT:    lui a2, 209715
+; RV64M-NEXT:    addiw a2, a2, 819
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 61681
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
-; RV64M-NEXT:    and a0, a0, a1
 ; RV64M-NEXT:    lui a1, 4112
+; RV64M-NEXT:    addi a2, a2, -241
+; RV64M-NEXT:    and a0, a0, a2
 ; RV64M-NEXT:    addi a1, a1, 257
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srliw a0, a0, 24
@@ -1964,11 +1964,11 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32I-LABEL: test_ctlz_i64_zero_undef:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB15_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -2033,40 +2033,40 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV64I-LABEL: test_ctlz_i64_zero_undef:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -2079,13 +2079,13 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32M-LABEL: test_ctlz_i64_zero_undef:
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    lui a2, 349525
+; RV32M-NEXT:    lui a3, 209715
+; RV32M-NEXT:    lui a6, 61681
+; RV32M-NEXT:    lui a7, 4112
 ; RV32M-NEXT:    addi a5, a2, 1365
-; RV32M-NEXT:    lui a2, 209715
-; RV32M-NEXT:    addi a4, a2, 819
-; RV32M-NEXT:    lui a2, 61681
-; RV32M-NEXT:    addi a2, a2, -241
-; RV32M-NEXT:    lui a3, 4112
-; RV32M-NEXT:    addi a3, a3, 257
+; RV32M-NEXT:    addi a4, a3, 819
+; RV32M-NEXT:    addi a3, a6, -241
+; RV32M-NEXT:    addi a2, a7, 257
 ; RV32M-NEXT:    bnez a1, .LBB15_2
 ; RV32M-NEXT:  # %bb.1:
 ; RV32M-NEXT:    srli a1, a0, 1
@@ -2108,8 +2108,8 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    mul a0, a0, a3
+; RV32M-NEXT:    and a0, a0, a3
+; RV32M-NEXT:    mul a0, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    addi a0, a0, 32
 ; RV32M-NEXT:    li a1, 0
@@ -2135,8 +2135,8 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    mul a0, a0, a3
+; RV32M-NEXT:    and a0, a0, a3
+; RV32M-NEXT:    mul a0, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    li a1, 0
 ; RV32M-NEXT:    ret
@@ -2144,44 +2144,44 @@ define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
 ; RV64M-LABEL: test_ctlz_i64_zero_undef:
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    srli a1, a0, 1
+; RV64M-NEXT:    lui a2, 349525
+; RV64M-NEXT:    lui a3, 209715
+; RV64M-NEXT:    lui a4, 61681
 ; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 2
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 4
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 8
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 16
-; RV64M-NEXT:    or a0, a0, a1
-; RV64M-NEXT:    srli a1, a0, 32
-; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    addiw a1, a2, 1365
+; RV64M-NEXT:    addiw a2, a3, 819
+; RV64M-NEXT:    addiw a3, a4, -241
+; RV64M-NEXT:    srli a4, a0, 2
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    slli a4, a1, 32
+; RV64M-NEXT:    add a1, a1, a4
+; RV64M-NEXT:    slli a4, a2, 32
+; RV64M-NEXT:    add a2, a2, a4
+; RV64M-NEXT:    slli a4, a3, 32
+; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    srli a4, a0, 4
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 8
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 16
+; RV64M-NEXT:    or a0, a0, a4
+; RV64M-NEXT:    srli a4, a0, 32
+; RV64M-NEXT:    or a0, a0, a4
 ; RV64M-NEXT:    not a0, a0
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    slli a3, a2, 32
-; RV64M-NEXT:    add a2, a2, a3
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    srli a4, a0, 1
+; RV64M-NEXT:    and a1, a4, a1
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 4112
+; RV64M-NEXT:    addiw a2, a2, 257
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addiw a1, a1, -241
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addiw a1, a1, 257
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a1, a2, 32
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    add a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
@@ -2304,13 +2304,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ; RV32_NOZBB-NEXT:    lui a2, 5
 ; RV32_NOZBB-NEXT:    addi a2, a2, 1365
 ; RV32_NOZBB-NEXT:    and a1, a1, a2
+; RV32_NOZBB-NEXT:    lui a2, 3
+; RV32_NOZBB-NEXT:    addi a2, a2, 819
 ; RV32_NOZBB-NEXT:    sub a0, a0, a1
-; RV32_NOZBB-NEXT:    lui a1, 3
-; RV32_NOZBB-NEXT:    addi a1, a1, 819
-; RV32_NOZBB-NEXT:    and a2, a0, a1
+; RV32_NOZBB-NEXT:    and a1, a0, a2
 ; RV32_NOZBB-NEXT:    srli a0, a0, 2
-; RV32_NOZBB-NEXT:    and a0, a0, a1
-; RV32_NOZBB-NEXT:    add a0, a2, a0
+; RV32_NOZBB-NEXT:    and a0, a0, a2
+; RV32_NOZBB-NEXT:    add a0, a1, a0
 ; RV32_NOZBB-NEXT:    srli a1, a0, 4
 ; RV32_NOZBB-NEXT:    add a0, a0, a1
 ; RV32_NOZBB-NEXT:    andi a1, a0, 15
@@ -2325,13 +2325,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ; RV64NOZBB-NEXT:    lui a2, 5
 ; RV64NOZBB-NEXT:    addiw a2, a2, 1365
 ; RV64NOZBB-NEXT:    and a1, a1, a2
+; RV64NOZBB-NEXT:    lui a2, 3
+; RV64NOZBB-NEXT:    addiw a2, a2, 819
 ; RV64NOZBB-NEXT:    sub a0, a0, a1
-; RV64NOZBB-NEXT:    lui a1, 3
-; RV64NOZBB-NEXT:    addiw a1, a1, 819
-; RV64NOZBB-NEXT:    and a2, a0, a1
+; RV64NOZBB-NEXT:    and a1, a0, a2
 ; RV64NOZBB-NEXT:    srli a0, a0, 2
-; RV64NOZBB-NEXT:    and a0, a0, a1
-; RV64NOZBB-NEXT:    add a0, a2, a0
+; RV64NOZBB-NEXT:    and a0, a0, a2
+; RV64NOZBB-NEXT:    add a0, a1, a0
 ; RV64NOZBB-NEXT:    srli a1, a0, 4
 ; RV64NOZBB-NEXT:    add a0, a0, a1
 ; RV64NOZBB-NEXT:    andi a1, a0, 15
@@ -2358,13 +2358,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ; RV32XTHEADBB-NEXT:    lui a2, 5
 ; RV32XTHEADBB-NEXT:    addi a2, a2, 1365
 ; RV32XTHEADBB-NEXT:    and a1, a1, a2
+; RV32XTHEADBB-NEXT:    lui a2, 3
+; RV32XTHEADBB-NEXT:    addi a2, a2, 819
 ; RV32XTHEADBB-NEXT:    sub a0, a0, a1
-; RV32XTHEADBB-NEXT:    lui a1, 3
-; RV32XTHEADBB-NEXT:    addi a1, a1, 819
-; RV32XTHEADBB-NEXT:    and a2, a0, a1
+; RV32XTHEADBB-NEXT:    and a1, a0, a2
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 2
-; RV32XTHEADBB-NEXT:    and a0, a0, a1
-; RV32XTHEADBB-NEXT:    add a0, a2, a0
+; RV32XTHEADBB-NEXT:    and a0, a0, a2
+; RV32XTHEADBB-NEXT:    add a0, a1, a0
 ; RV32XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
 ; RV32XTHEADBB-NEXT:    th.extu a1, a0, 11, 8
@@ -2378,13 +2378,13 @@ define i16 @test_ctpop_i16(i16 %a) nounwind {
 ; RV64XTHEADBB-NEXT:    lui a2, 5
 ; RV64XTHEADBB-NEXT:    addiw a2, a2, 1365
 ; RV64XTHEADBB-NEXT:    and a1, a1, a2
+; RV64XTHEADBB-NEXT:    lui a2, 3
+; RV64XTHEADBB-NEXT:    addiw a2, a2, 819
 ; RV64XTHEADBB-NEXT:    sub a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 3
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 819
-; RV64XTHEADBB-NEXT:    and a2, a0, a1
+; RV64XTHEADBB-NEXT:    and a1, a0, a2
 ; RV64XTHEADBB-NEXT:    srli a0, a0, 2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    add a0, a2, a0
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    add a0, a1, a0
 ; RV64XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    th.extu a1, a0, 11, 8
@@ -2402,17 +2402,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -2427,17 +2427,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -2452,19 +2452,19 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV32M-NEXT:    lui a2, 349525
 ; RV32M-NEXT:    addi a2, a2, 1365
 ; RV32M-NEXT:    and a1, a1, a2
+; RV32M-NEXT:    lui a2, 209715
+; RV32M-NEXT:    addi a2, a2, 819
 ; RV32M-NEXT:    sub a0, a0, a1
-; RV32M-NEXT:    lui a1, 209715
-; RV32M-NEXT:    addi a1, a1, 819
-; RV32M-NEXT:    and a2, a0, a1
+; RV32M-NEXT:    and a1, a0, a2
 ; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    and a0, a0, a1
-; RV32M-NEXT:    add a0, a2, a0
+; RV32M-NEXT:    and a0, a0, a2
+; RV32M-NEXT:    lui a2, 61681
+; RV32M-NEXT:    add a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 4
 ; RV32M-NEXT:    add a0, a0, a1
-; RV32M-NEXT:    lui a1, 61681
-; RV32M-NEXT:    addi a1, a1, -241
-; RV32M-NEXT:    and a0, a0, a1
 ; RV32M-NEXT:    lui a1, 4112
+; RV32M-NEXT:    addi a2, a2, -241
+; RV32M-NEXT:    and a0, a0, a2
 ; RV32M-NEXT:    addi a1, a1, 257
 ; RV32M-NEXT:    mul a0, a0, a1
 ; RV32M-NEXT:    srli a0, a0, 24
@@ -2476,19 +2476,19 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64M-NEXT:    lui a2, 349525
 ; RV64M-NEXT:    addiw a2, a2, 1365
 ; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    lui a2, 209715
+; RV64M-NEXT:    addiw a2, a2, 819
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 61681
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addi a1, a1, -241
-; RV64M-NEXT:    and a0, a0, a1
 ; RV64M-NEXT:    lui a1, 4112
+; RV64M-NEXT:    addi a2, a2, -241
+; RV64M-NEXT:    and a0, a0, a2
 ; RV64M-NEXT:    addi a1, a1, 257
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srliw a0, a0, 24
@@ -2510,17 +2510,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV32XTHEADBB-NEXT:    lui a2, 349525
 ; RV32XTHEADBB-NEXT:    addi a2, a2, 1365
 ; RV32XTHEADBB-NEXT:    and a1, a1, a2
+; RV32XTHEADBB-NEXT:    lui a2, 209715
+; RV32XTHEADBB-NEXT:    addi a2, a2, 819
 ; RV32XTHEADBB-NEXT:    sub a0, a0, a1
-; RV32XTHEADBB-NEXT:    lui a1, 209715
-; RV32XTHEADBB-NEXT:    addi a1, a1, 819
-; RV32XTHEADBB-NEXT:    and a2, a0, a1
+; RV32XTHEADBB-NEXT:    and a1, a0, a2
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 2
-; RV32XTHEADBB-NEXT:    and a0, a0, a1
-; RV32XTHEADBB-NEXT:    add a0, a2, a0
+; RV32XTHEADBB-NEXT:    and a0, a0, a2
+; RV32XTHEADBB-NEXT:    lui a2, 61681
+; RV32XTHEADBB-NEXT:    add a0, a1, a0
 ; RV32XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
-; RV32XTHEADBB-NEXT:    lui a1, 61681
-; RV32XTHEADBB-NEXT:    addi a1, a1, -241
+; RV32XTHEADBB-NEXT:    addi a1, a2, -241
 ; RV32XTHEADBB-NEXT:    and a0, a0, a1
 ; RV32XTHEADBB-NEXT:    slli a1, a0, 8
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
@@ -2535,17 +2535,17 @@ define i32 @test_ctpop_i32(i32 %a) nounwind {
 ; RV64XTHEADBB-NEXT:    lui a2, 349525
 ; RV64XTHEADBB-NEXT:    addiw a2, a2, 1365
 ; RV64XTHEADBB-NEXT:    and a1, a1, a2
+; RV64XTHEADBB-NEXT:    lui a2, 209715
+; RV64XTHEADBB-NEXT:    addiw a2, a2, 819
 ; RV64XTHEADBB-NEXT:    sub a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 209715
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 819
-; RV64XTHEADBB-NEXT:    and a2, a0, a1
+; RV64XTHEADBB-NEXT:    and a1, a0, a2
 ; RV64XTHEADBB-NEXT:    srli a0, a0, 2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    add a0, a2, a0
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    lui a2, 61681
+; RV64XTHEADBB-NEXT:    add a0, a1, a0
 ; RV64XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 61681
-; RV64XTHEADBB-NEXT:    addi a1, a1, -241
+; RV64XTHEADBB-NEXT:    addi a1, a2, -241
 ; RV64XTHEADBB-NEXT:    and a0, a0, a1
 ; RV64XTHEADBB-NEXT:    slli a1, a0, 8
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
@@ -2562,39 +2562,39 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a0, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a4, a1, a2
-; RV32I-NEXT:    srli a1, a1, 2
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    srli a4, a1, 4
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi a4, a4, -241
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a5, a1, 8
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    slli a5, a1, 16
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    srli a5, a0, 1
 ; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:    sub a0, a0, a3
-; RV32I-NEXT:    and a3, a0, a2
+; RV32I-NEXT:    and a2, a1, a4
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and a3, a0, a4
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    slli a2, a0, 8
-; RV32I-NEXT:    add a0, a0, a2
-; RV32I-NEXT:    slli a2, a0, 16
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    srli a3, a0, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    slli a2, a1, 8
+; RV32I-NEXT:    slli a3, a0, 8
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    slli a2, a1, 16
+; RV32I-NEXT:    slli a3, a0, 16
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    li a1, 0
@@ -2602,28 +2602,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ;
 ; RV64I-LABEL: test_ctpop_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -2637,35 +2637,35 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    srli a2, a1, 1
 ; RV32M-NEXT:    lui a3, 349525
+; RV32M-NEXT:    lui a4, 209715
+; RV32M-NEXT:    lui a5, 61681
+; RV32M-NEXT:    srli a6, a0, 1
 ; RV32M-NEXT:    addi a3, a3, 1365
 ; RV32M-NEXT:    and a2, a2, a3
-; RV32M-NEXT:    sub a1, a1, a2
-; RV32M-NEXT:    lui a2, 209715
-; RV32M-NEXT:    addi a2, a2, 819
-; RV32M-NEXT:    and a4, a1, a2
-; RV32M-NEXT:    srli a1, a1, 2
-; RV32M-NEXT:    and a1, a1, a2
-; RV32M-NEXT:    add a1, a4, a1
-; RV32M-NEXT:    srli a4, a1, 4
-; RV32M-NEXT:    add a1, a1, a4
-; RV32M-NEXT:    lui a4, 61681
-; RV32M-NEXT:    addi a4, a4, -241
-; RV32M-NEXT:    and a1, a1, a4
-; RV32M-NEXT:    lui a5, 4112
-; RV32M-NEXT:    addi a5, a5, 257
-; RV32M-NEXT:    mul a1, a1, a5
-; RV32M-NEXT:    srli a1, a1, 24
-; RV32M-NEXT:    srli a6, a0, 1
 ; RV32M-NEXT:    and a3, a6, a3
+; RV32M-NEXT:    lui a6, 4112
+; RV32M-NEXT:    addi a4, a4, 819
+; RV32M-NEXT:    addi a5, a5, -241
+; RV32M-NEXT:    addi a6, a6, 257
+; RV32M-NEXT:    sub a1, a1, a2
 ; RV32M-NEXT:    sub a0, a0, a3
-; RV32M-NEXT:    and a3, a0, a2
+; RV32M-NEXT:    and a2, a1, a4
+; RV32M-NEXT:    srli a1, a1, 2
+; RV32M-NEXT:    and a3, a0, a4
 ; RV32M-NEXT:    srli a0, a0, 2
-; RV32M-NEXT:    and a0, a0, a2
-; RV32M-NEXT:    add a0, a3, a0
-; RV32M-NEXT:    srli a2, a0, 4
-; RV32M-NEXT:    add a0, a0, a2
+; RV32M-NEXT:    and a1, a1, a4
 ; RV32M-NEXT:    and a0, a0, a4
-; RV32M-NEXT:    mul a0, a0, a5
+; RV32M-NEXT:    add a1, a2, a1
+; RV32M-NEXT:    add a0, a3, a0
+; RV32M-NEXT:    srli a2, a1, 4
+; RV32M-NEXT:    srli a3, a0, 4
+; RV32M-NEXT:    add a1, a1, a2
+; RV32M-NEXT:    add a0, a0, a3
+; RV32M-NEXT:    and a1, a1, a5
+; RV32M-NEXT:    and a0, a0, a5
+; RV32M-NEXT:    mul a1, a1, a6
+; RV32M-NEXT:    mul a0, a0, a6
+; RV32M-NEXT:    srli a1, a1, 24
 ; RV32M-NEXT:    srli a0, a0, 24
 ; RV32M-NEXT:    add a0, a0, a1
 ; RV32M-NEXT:    li a1, 0
@@ -2673,32 +2673,32 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ;
 ; RV64M-LABEL: test_ctpop_i64:
 ; RV64M:       # %bb.0:
-; RV64M-NEXT:    srli a1, a0, 1
-; RV64M-NEXT:    lui a2, 349525
-; RV64M-NEXT:    addiw a2, a2, 1365
-; RV64M-NEXT:    slli a3, a2, 32
-; RV64M-NEXT:    add a2, a2, a3
-; RV64M-NEXT:    and a1, a1, a2
+; RV64M-NEXT:    lui a1, 349525
+; RV64M-NEXT:    lui a2, 209715
+; RV64M-NEXT:    lui a3, 61681
+; RV64M-NEXT:    addiw a1, a1, 1365
+; RV64M-NEXT:    addiw a2, a2, 819
+; RV64M-NEXT:    addiw a3, a3, -241
+; RV64M-NEXT:    slli a4, a1, 32
+; RV64M-NEXT:    add a1, a1, a4
+; RV64M-NEXT:    slli a4, a2, 32
+; RV64M-NEXT:    add a2, a2, a4
+; RV64M-NEXT:    slli a4, a3, 32
+; RV64M-NEXT:    add a3, a3, a4
+; RV64M-NEXT:    srli a4, a0, 1
+; RV64M-NEXT:    and a1, a4, a1
 ; RV64M-NEXT:    sub a0, a0, a1
-; RV64M-NEXT:    lui a1, 209715
-; RV64M-NEXT:    addiw a1, a1, 819
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a2, a0, a1
+; RV64M-NEXT:    and a1, a0, a2
 ; RV64M-NEXT:    srli a0, a0, 2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    add a0, a2, a0
+; RV64M-NEXT:    and a0, a0, a2
+; RV64M-NEXT:    lui a2, 4112
+; RV64M-NEXT:    addiw a2, a2, 257
+; RV64M-NEXT:    add a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 4
 ; RV64M-NEXT:    add a0, a0, a1
-; RV64M-NEXT:    lui a1, 61681
-; RV64M-NEXT:    addiw a1, a1, -241
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
-; RV64M-NEXT:    and a0, a0, a1
-; RV64M-NEXT:    lui a1, 4112
-; RV64M-NEXT:    addiw a1, a1, 257
-; RV64M-NEXT:    slli a2, a1, 32
-; RV64M-NEXT:    add a1, a1, a2
+; RV64M-NEXT:    slli a1, a2, 32
+; RV64M-NEXT:    and a0, a0, a3
+; RV64M-NEXT:    add a1, a2, a1
 ; RV64M-NEXT:    mul a0, a0, a1
 ; RV64M-NEXT:    srli a0, a0, 56
 ; RV64M-NEXT:    ret
@@ -2720,39 +2720,39 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ; RV32XTHEADBB:       # %bb.0:
 ; RV32XTHEADBB-NEXT:    srli a2, a1, 1
 ; RV32XTHEADBB-NEXT:    lui a3, 349525
+; RV32XTHEADBB-NEXT:    lui a4, 209715
+; RV32XTHEADBB-NEXT:    srli a5, a0, 1
 ; RV32XTHEADBB-NEXT:    addi a3, a3, 1365
 ; RV32XTHEADBB-NEXT:    and a2, a2, a3
-; RV32XTHEADBB-NEXT:    sub a1, a1, a2
-; RV32XTHEADBB-NEXT:    lui a2, 209715
-; RV32XTHEADBB-NEXT:    addi a2, a2, 819
-; RV32XTHEADBB-NEXT:    and a4, a1, a2
-; RV32XTHEADBB-NEXT:    srli a1, a1, 2
-; RV32XTHEADBB-NEXT:    and a1, a1, a2
-; RV32XTHEADBB-NEXT:    add a1, a4, a1
-; RV32XTHEADBB-NEXT:    srli a4, a1, 4
-; RV32XTHEADBB-NEXT:    add a1, a1, a4
-; RV32XTHEADBB-NEXT:    lui a4, 61681
-; RV32XTHEADBB-NEXT:    addi a4, a4, -241
-; RV32XTHEADBB-NEXT:    and a1, a1, a4
-; RV32XTHEADBB-NEXT:    slli a5, a1, 8
-; RV32XTHEADBB-NEXT:    add a1, a1, a5
-; RV32XTHEADBB-NEXT:    slli a5, a1, 16
-; RV32XTHEADBB-NEXT:    add a1, a1, a5
-; RV32XTHEADBB-NEXT:    srli a1, a1, 24
-; RV32XTHEADBB-NEXT:    srli a5, a0, 1
 ; RV32XTHEADBB-NEXT:    and a3, a5, a3
+; RV32XTHEADBB-NEXT:    lui a5, 61681
+; RV32XTHEADBB-NEXT:    addi a4, a4, 819
+; RV32XTHEADBB-NEXT:    addi a5, a5, -241
+; RV32XTHEADBB-NEXT:    sub a1, a1, a2
 ; RV32XTHEADBB-NEXT:    sub a0, a0, a3
-; RV32XTHEADBB-NEXT:    and a3, a0, a2
+; RV32XTHEADBB-NEXT:    and a2, a1, a4
+; RV32XTHEADBB-NEXT:    srli a1, a1, 2
+; RV32XTHEADBB-NEXT:    and a3, a0, a4
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 2
-; RV32XTHEADBB-NEXT:    and a0, a0, a2
-; RV32XTHEADBB-NEXT:    add a0, a3, a0
-; RV32XTHEADBB-NEXT:    srli a2, a0, 4
-; RV32XTHEADBB-NEXT:    add a0, a0, a2
+; RV32XTHEADBB-NEXT:    and a1, a1, a4
 ; RV32XTHEADBB-NEXT:    and a0, a0, a4
-; RV32XTHEADBB-NEXT:    slli a2, a0, 8
-; RV32XTHEADBB-NEXT:    add a0, a0, a2
-; RV32XTHEADBB-NEXT:    slli a2, a0, 16
-; RV32XTHEADBB-NEXT:    add a0, a0, a2
+; RV32XTHEADBB-NEXT:    add a1, a2, a1
+; RV32XTHEADBB-NEXT:    add a0, a3, a0
+; RV32XTHEADBB-NEXT:    srli a2, a1, 4
+; RV32XTHEADBB-NEXT:    srli a3, a0, 4
+; RV32XTHEADBB-NEXT:    add a1, a1, a2
+; RV32XTHEADBB-NEXT:    add a0, a0, a3
+; RV32XTHEADBB-NEXT:    and a1, a1, a5
+; RV32XTHEADBB-NEXT:    and a0, a0, a5
+; RV32XTHEADBB-NEXT:    slli a2, a1, 8
+; RV32XTHEADBB-NEXT:    slli a3, a0, 8
+; RV32XTHEADBB-NEXT:    add a1, a1, a2
+; RV32XTHEADBB-NEXT:    add a0, a0, a3
+; RV32XTHEADBB-NEXT:    slli a2, a1, 16
+; RV32XTHEADBB-NEXT:    slli a3, a0, 16
+; RV32XTHEADBB-NEXT:    add a1, a1, a2
+; RV32XTHEADBB-NEXT:    add a0, a0, a3
+; RV32XTHEADBB-NEXT:    srli a1, a1, 24
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 24
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
 ; RV32XTHEADBB-NEXT:    li a1, 0
@@ -2760,28 +2760,28 @@ define i64 @test_ctpop_i64(i64 %a) nounwind {
 ;
 ; RV64XTHEADBB-LABEL: test_ctpop_i64:
 ; RV64XTHEADBB:       # %bb.0:
-; RV64XTHEADBB-NEXT:    srli a1, a0, 1
-; RV64XTHEADBB-NEXT:    lui a2, 349525
-; RV64XTHEADBB-NEXT:    addiw a2, a2, 1365
+; RV64XTHEADBB-NEXT:    lui a1, 349525
+; RV64XTHEADBB-NEXT:    lui a2, 209715
+; RV64XTHEADBB-NEXT:    addiw a1, a1, 1365
+; RV64XTHEADBB-NEXT:    addiw a2, a2, 819
+; RV64XTHEADBB-NEXT:    slli a3, a1, 32
+; RV64XTHEADBB-NEXT:    add a1, a1, a3
 ; RV64XTHEADBB-NEXT:    slli a3, a2, 32
 ; RV64XTHEADBB-NEXT:    add a2, a2, a3
-; RV64XTHEADBB-NEXT:    and a1, a1, a2
+; RV64XTHEADBB-NEXT:    srli a3, a0, 1
+; RV64XTHEADBB-NEXT:    and a1, a3, a1
+; RV64XTHEADBB-NEXT:    lui a3, 61681
+; RV64XTHEADBB-NEXT:    addiw a3, a3, -241
 ; RV64XTHEADBB-NEXT:    sub a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 209715
-; RV64XTHEADBB-NEXT:    addiw a1, a1, 819
-; RV64XTHEADBB-NEXT:    slli a2, a1, 32
-; RV64XTHEADBB-NEXT:    add a1, a1, a2
-; RV64XTHEADBB-NEXT:    and a2, a0, a1
+; RV64XTHEADBB-NEXT:    and a1, a0, a2
 ; RV64XTHEADBB-NEXT:    srli a0, a0, 2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
-; RV64XTHEADBB-NEXT:    add a0, a2, a0
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
+; RV64XTHEADBB-NEXT:    slli a2, a3, 32
+; RV64XTHEADBB-NEXT:    add a0, a1, a0
 ; RV64XTHEADBB-NEXT:    srli a1, a0, 4
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
-; RV64XTHEADBB-NEXT:    lui a1, 61681
-; RV64XTHEADBB-NEXT:    addiw a1, a1, -241
-; RV64XTHEADBB-NEXT:    slli a2, a1, 32
-; RV64XTHEADBB-NEXT:    add a1, a1, a2
-; RV64XTHEADBB-NEXT:    and a0, a0, a1
+; RV64XTHEADBB-NEXT:    add a2, a3, a2
+; RV64XTHEADBB-NEXT:    and a0, a0, a2
 ; RV64XTHEADBB-NEXT:    slli a1, a0, 8
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    slli a1, a0, 16

diff  --git a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
index fe6e20d852d590..03a6a6b1c4b7da 100644
--- a/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
+++ b/llvm/test/CodeGen/RISCV/ctz_zero_return_test.ll
@@ -603,11 +603,11 @@ define signext i32 @ctlz(i64 %b) nounwind {
 ; RV32I-LABEL: ctlz:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB7_2
 ; RV32I-NEXT:  # %bb.1: # %entry
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -672,40 +672,40 @@ define signext i32 @ctlz(i64 %b) nounwind {
 ; RV64I-LABEL: ctlz:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16

diff  --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll
index 3d9fb91e3adf82..844fa0d1e6ad6e 100644
--- a/llvm/test/CodeGen/RISCV/div-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll
@@ -80,25 +80,25 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind {
 ; RV32-LABEL: udiv64_constant_no_add:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 838861
-; RV32-NEXT:    addi a4, a3, -819
-; RV32-NEXT:    mulhu a5, a2, a4
-; RV32-NEXT:    srli a6, a5, 2
-; RV32-NEXT:    andi a5, a5, -4
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    sub a2, a2, a5
-; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    sltu a4, a2, a0
+; RV32-NEXT:    addi a5, a3, -819
 ; RV32-NEXT:    addi a3, a3, -820
-; RV32-NEXT:    mul a3, a5, a3
-; RV32-NEXT:    mulhu a6, a5, a4
-; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    mulhu a4, a2, a5
+; RV32-NEXT:    srli a6, a4, 2
+; RV32-NEXT:    andi a4, a4, -4
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    sub a2, a2, a4
+; RV32-NEXT:    sub a4, a0, a2
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a4, a3
+; RV32-NEXT:    mulhu a3, a4, a5
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    mul a0, a5, a4
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    mul a0, a4, a5
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: udiv64_constant_no_add:
@@ -485,8 +485,8 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant_no_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 24
-; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    li a1, 86
+; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
 ; RV32IM-NEXT:    srli a0, a0, 8
@@ -506,8 +506,8 @@ define i8 @sdiv8_constant_no_srai(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant_no_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 56
-; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    li a1, 86
+; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
 ; RV64IM-NEXT:    srli a0, a0, 8
@@ -531,8 +531,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 24
-; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    li a1, 103
+; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
 ; RV32IM-NEXT:    srai a0, a0, 9
@@ -552,8 +552,8 @@ define i8 @sdiv8_constant_srai(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 56
-; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    li a1, 103
+; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
 ; RV64IM-NEXT:    srai a0, a0, 9
@@ -577,8 +577,8 @@ define i8 @sdiv8_constant_add_srai(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant_add_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a0, 24
-; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    li a2, -109
+; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    srli a1, a1, 8
 ; RV32IM-NEXT:    add a0, a1, a0
@@ -604,8 +604,8 @@ define i8 @sdiv8_constant_add_srai(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant_add_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 56
-; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    li a2, -109
+; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 8
 ; RV64IM-NEXT:    add a0, a1, a0
@@ -635,8 +635,8 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant_sub_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a0, 24
-; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    li a2, 109
+; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    srli a1, a1, 8
 ; RV32IM-NEXT:    sub a1, a1, a0
@@ -662,8 +662,8 @@ define i8 @sdiv8_constant_sub_srai(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant_sub_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 56
-; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    li a2, 109
+; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 8
 ; RV64IM-NEXT:    subw a1, a1, a0
@@ -693,8 +693,8 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant_no_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 16
-; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    lui a1, 5
+; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    addi a1, a1, 1366
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
@@ -716,8 +716,8 @@ define i16 @sdiv16_constant_no_srai(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant_no_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 48
-; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    lui a1, 5
+; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    addiw a1, a1, 1366
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
@@ -743,8 +743,8 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 16
-; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    lui a1, 6
+; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    addi a1, a1, 1639
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
@@ -766,8 +766,8 @@ define i16 @sdiv16_constant_srai(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 48
-; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    lui a1, 6
+; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    addiw a1, a1, 1639
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
@@ -793,8 +793,8 @@ define i16 @sdiv16_constant_add_srai(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant_add_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a0, 16
-; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    lui a2, 1048569
+; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    addi a2, a2, -1911
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    srli a1, a1, 16
@@ -822,8 +822,8 @@ define i16 @sdiv16_constant_add_srai(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant_add_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 48
-; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    lui a2, 1048569
+; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    addiw a2, a2, -1911
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 16
@@ -855,8 +855,8 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant_sub_srai:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a0, 16
-; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    lui a2, 7
+; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    addi a2, a2, 1911
 ; RV32IM-NEXT:    mul a1, a1, a2
 ; RV32IM-NEXT:    srli a1, a1, 16
@@ -884,8 +884,8 @@ define i16 @sdiv16_constant_sub_srai(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant_sub_srai:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a0, 48
-; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    lui a2, 7
+; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    addiw a2, a2, 1911
 ; RV64IM-NEXT:    mul a1, a1, a2
 ; RV64IM-NEXT:    srli a1, a1, 16

diff  --git a/llvm/test/CodeGen/RISCV/div-pow2.ll b/llvm/test/CodeGen/RISCV/div-pow2.ll
index 254e675b4ed8b5..6ea5a37ba29635 100644
--- a/llvm/test/CodeGen/RISCV/div-pow2.ll
+++ b/llvm/test/CodeGen/RISCV/div-pow2.ll
@@ -207,14 +207,14 @@ define i64 @sdiv64_pow2_negative_2(i64 %a) {
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    srli a3, a2, 1
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    slli a0, a1, 31
-; RV32I-NEXT:    or a3, a3, a0
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    srai a1, a1, 1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 31
+; RV32I-NEXT:    srai a2, a0, 1
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_2:
@@ -263,14 +263,14 @@ define i64 @sdiv64_pow2_negative_2048(i64 %a) {
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    srli a3, a2, 11
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    slli a0, a1, 21
-; RV32I-NEXT:    or a3, a3, a0
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    srai a1, a1, 11
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 21
+; RV32I-NEXT:    srai a2, a0, 11
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_2048:
@@ -320,14 +320,14 @@ define i64 @sdiv64_pow2_negative_4096(i64 %a) {
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    srli a3, a2, 12
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    slli a0, a1, 20
-; RV32I-NEXT:    or a3, a3, a0
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    srai a1, a1, 12
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 20
+; RV32I-NEXT:    srai a2, a0, 12
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_4096:
@@ -377,14 +377,14 @@ define i64 @sdiv64_pow2_negative_65536(i64 %a) {
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    srli a3, a2, 16
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a1, a1, a0
-; RV32I-NEXT:    slli a0, a1, 16
-; RV32I-NEXT:    or a3, a3, a0
-; RV32I-NEXT:    neg a0, a3
-; RV32I-NEXT:    snez a2, a3
-; RV32I-NEXT:    srai a1, a1, 16
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    srai a2, a0, 16
+; RV32I-NEXT:    or a1, a3, a1
+; RV32I-NEXT:    neg a0, a1
+; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: sdiv64_pow2_negative_65536:

diff  --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll
index e94efbea8376d5..bda6ff43a5e7c0 100644
--- a/llvm/test/CodeGen/RISCV/div.ll
+++ b/llvm/test/CodeGen/RISCV/div.ll
@@ -23,8 +23,8 @@ define i32 @udiv(i32 %a, i32 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    call __udivdi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -182,25 +182,25 @@ define i64 @udiv64_constant(i64 %a) nounwind {
 ; RV32IM-LABEL: udiv64_constant:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    add a2, a0, a1
-; RV32IM-NEXT:    sltu a3, a2, a0
-; RV32IM-NEXT:    add a2, a2, a3
 ; RV32IM-NEXT:    lui a3, 838861
-; RV32IM-NEXT:    addi a4, a3, -819
-; RV32IM-NEXT:    mulhu a5, a2, a4
-; RV32IM-NEXT:    srli a6, a5, 2
-; RV32IM-NEXT:    andi a5, a5, -4
-; RV32IM-NEXT:    add a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    sub a5, a0, a2
+; RV32IM-NEXT:    sltu a4, a2, a0
+; RV32IM-NEXT:    addi a5, a3, -819
 ; RV32IM-NEXT:    addi a3, a3, -820
-; RV32IM-NEXT:    mul a3, a5, a3
-; RV32IM-NEXT:    mulhu a6, a5, a4
-; RV32IM-NEXT:    add a3, a6, a3
+; RV32IM-NEXT:    add a2, a2, a4
+; RV32IM-NEXT:    mulhu a4, a2, a5
+; RV32IM-NEXT:    srli a6, a4, 2
+; RV32IM-NEXT:    andi a4, a4, -4
+; RV32IM-NEXT:    add a4, a4, a6
+; RV32IM-NEXT:    sub a2, a2, a4
+; RV32IM-NEXT:    sub a4, a0, a2
 ; RV32IM-NEXT:    sltu a0, a0, a2
+; RV32IM-NEXT:    mul a2, a4, a3
+; RV32IM-NEXT:    mulhu a3, a4, a5
 ; RV32IM-NEXT:    sub a1, a1, a0
-; RV32IM-NEXT:    mul a1, a1, a4
-; RV32IM-NEXT:    add a1, a3, a1
-; RV32IM-NEXT:    mul a0, a5, a4
+; RV32IM-NEXT:    add a2, a3, a2
+; RV32IM-NEXT:    mul a1, a1, a5
+; RV32IM-NEXT:    add a1, a2, a1
+; RV32IM-NEXT:    mul a0, a4, a5
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: udiv64_constant:
@@ -919,8 +919,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    call __divsi3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -930,8 +930,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind {
 ; RV32IM-LABEL: sdiv8:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a1, 24
-; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    slli a0, a0, 24
+; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    div a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -941,8 +941,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    call __divdi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -952,8 +952,8 @@ define i8 @sdiv8(i8 %a, i8 %b) nounwind {
 ; RV64IM-LABEL: sdiv8:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a1, 56
-; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    slli a0, a0, 56
+; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    divw a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -977,8 +977,8 @@ define i8 @sdiv8_constant(i8 %a) nounwind {
 ; RV32IM-LABEL: sdiv8_constant:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 24
-; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    li a1, 103
+; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
 ; RV32IM-NEXT:    srai a0, a0, 9
@@ -1000,8 +1000,8 @@ define i8 @sdiv8_constant(i8 %a) nounwind {
 ; RV64IM-LABEL: sdiv8_constant:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 56
-; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    li a1, 103
+; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63
 ; RV64IM-NEXT:    srai a0, a0, 9
@@ -1105,8 +1105,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    call __divsi3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -1116,8 +1116,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind {
 ; RV32IM-LABEL: sdiv16:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a1, 16
-; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    slli a0, a0, 16
+; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    div a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -1127,8 +1127,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    call __divdi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1138,8 +1138,8 @@ define i16 @sdiv16(i16 %a, i16 %b) nounwind {
 ; RV64IM-LABEL: sdiv16:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a1, 48
-; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    slli a0, a0, 48
+; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    divw a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -1163,8 +1163,8 @@ define i16 @sdiv16_constant(i16 %a) nounwind {
 ; RV32IM-LABEL: sdiv16_constant:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a0, a0, 16
-; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    lui a1, 6
+; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    addi a1, a1, 1639
 ; RV32IM-NEXT:    mul a0, a0, a1
 ; RV32IM-NEXT:    srli a1, a0, 31
@@ -1187,8 +1187,8 @@ define i16 @sdiv16_constant(i16 %a) nounwind {
 ; RV64IM-LABEL: sdiv16_constant:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a0, a0, 48
-; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    lui a1, 6
+; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    addiw a1, a1, 1639
 ; RV64IM-NEXT:    mul a0, a0, a1
 ; RV64IM-NEXT:    srli a1, a0, 63

diff  --git a/llvm/test/CodeGen/RISCV/double-arith.ll b/llvm/test/CodeGen/RISCV/double-arith.ll
index 5f06398daa8b9a..44d7bc590a797b 100644
--- a/llvm/test/CodeGen/RISCV/double-arith.ll
+++ b/llvm/test/CodeGen/RISCV/double-arith.ll
@@ -225,8 +225,8 @@ define double @fsgnj_d(double %a, double %b) nounwind {
 ; RV32I-LABEL: fsgnj_d:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -234,8 +234,8 @@ define double @fsgnj_d(double %a, double %b) nounwind {
 ; RV64I-LABEL: fsgnj_d:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a1, 63
-; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -327,8 +327,8 @@ define double @fsgnjn_d(double %a, double %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a2, a3
 ; RV32I-NEXT:    lui a3, 524288
-; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -1524,8 +1524,8 @@ define double @fsgnjx_f64(double %x, double %y) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    srli a0, a0, 63
-; RV64I-NEXT:    slli a0, a0, 63
 ; RV64I-NEXT:    li a2, 1023
+; RV64I-NEXT:    slli a0, a0, 63
 ; RV64I-NEXT:    slli a2, a2, 52
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    call __muldf3

diff  --git a/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
index 01aa25c15c8d2b..14193bf4cb169e 100644
--- a/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
+++ b/llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll
@@ -112,8 +112,8 @@ define double @fcopysign_fneg(double %a, double %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a2, a3
 ; RV32I-NEXT:    lui a3, 524288
-; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a2, a3
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/double-calling-conv.ll b/llvm/test/CodeGen/RISCV/double-calling-conv.ll
index b9e80dccd97b9a..798eac64e9fc26 100644
--- a/llvm/test/CodeGen/RISCV/double-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/double-calling-conv.ll
@@ -42,11 +42,11 @@ define double @caller_double_inreg() nounwind {
 ; RV32IFD-NEXT:    addi sp, sp, -16
 ; RV32IFD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IFD-NEXT:    lui a0, 262236
+; RV32IFD-NEXT:    lui a2, 377487
+; RV32IFD-NEXT:    lui a3, 262364
 ; RV32IFD-NEXT:    addi a1, a0, 655
-; RV32IFD-NEXT:    lui a0, 377487
-; RV32IFD-NEXT:    addi a0, a0, 1475
-; RV32IFD-NEXT:    lui a2, 262364
-; RV32IFD-NEXT:    addi a3, a2, 655
+; RV32IFD-NEXT:    addi a0, a2, 1475
+; RV32IFD-NEXT:    addi a3, a3, 655
 ; RV32IFD-NEXT:    mv a2, a0
 ; RV32IFD-NEXT:    call callee_double_inreg
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -58,11 +58,11 @@ define double @caller_double_inreg() nounwind {
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, -16
 ; RV32IZFINXZDINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32IZFINXZDINX-NEXT:    lui a0, 262236
+; RV32IZFINXZDINX-NEXT:    lui a2, 377487
+; RV32IZFINXZDINX-NEXT:    lui a3, 262364
 ; RV32IZFINXZDINX-NEXT:    addi a1, a0, 655
-; RV32IZFINXZDINX-NEXT:    lui a0, 377487
-; RV32IZFINXZDINX-NEXT:    addi a0, a0, 1475
-; RV32IZFINXZDINX-NEXT:    lui a2, 262364
-; RV32IZFINXZDINX-NEXT:    addi a3, a2, 655
+; RV32IZFINXZDINX-NEXT:    addi a0, a2, 1475
+; RV32IZFINXZDINX-NEXT:    addi a3, a3, 655
 ; RV32IZFINXZDINX-NEXT:    mv a2, a0
 ; RV32IZFINXZDINX-NEXT:    call callee_double_inreg
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -107,14 +107,14 @@ define double @caller_double_split_reg_stack() nounwind {
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
 ; RV32IFD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    lui a0, 262510
-; RV32IFD-NEXT:    addi a2, a0, 327
-; RV32IFD-NEXT:    lui a0, 262446
-; RV32IFD-NEXT:    addi a6, a0, 327
-; RV32IFD-NEXT:    lui a0, 713032
-; RV32IFD-NEXT:    addi a5, a0, -1311
+; RV32IFD-NEXT:    lui a2, 262510
+; RV32IFD-NEXT:    lui a3, 262446
+; RV32IFD-NEXT:    lui a4, 713032
 ; RV32IFD-NEXT:    li a0, 1
 ; RV32IFD-NEXT:    li a1, 2
+; RV32IFD-NEXT:    addi a2, a2, 327
+; RV32IFD-NEXT:    addi a6, a3, 327
+; RV32IFD-NEXT:    addi a5, a4, -1311
 ; RV32IFD-NEXT:    li a3, 3
 ; RV32IFD-NEXT:    sw a2, 0(sp)
 ; RV32IFD-NEXT:    li a2, 0
@@ -129,14 +129,14 @@ define double @caller_double_split_reg_stack() nounwind {
 ; RV32IZFINXZDINX:       # %bb.0:
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, -16
 ; RV32IZFINXZDINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    lui a0, 262510
-; RV32IZFINXZDINX-NEXT:    addi a2, a0, 327
-; RV32IZFINXZDINX-NEXT:    lui a0, 262446
-; RV32IZFINXZDINX-NEXT:    addi a6, a0, 327
-; RV32IZFINXZDINX-NEXT:    lui a0, 713032
-; RV32IZFINXZDINX-NEXT:    addi a5, a0, -1311
+; RV32IZFINXZDINX-NEXT:    lui a2, 262510
+; RV32IZFINXZDINX-NEXT:    lui a3, 262446
+; RV32IZFINXZDINX-NEXT:    lui a4, 713032
 ; RV32IZFINXZDINX-NEXT:    li a0, 1
 ; RV32IZFINXZDINX-NEXT:    li a1, 2
+; RV32IZFINXZDINX-NEXT:    addi a2, a2, 327
+; RV32IZFINXZDINX-NEXT:    addi a6, a3, 327
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1311
 ; RV32IZFINXZDINX-NEXT:    li a3, 3
 ; RV32IZFINXZDINX-NEXT:    sw a2, 0(sp)
 ; RV32IZFINXZDINX-NEXT:    li a2, 0
@@ -180,16 +180,16 @@ define double @caller_double_stack() nounwind {
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -32
 ; RV32IFD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    lui a0, 262510
-; RV32IFD-NEXT:    addi a1, a0, 327
-; RV32IFD-NEXT:    lui a0, 713032
-; RV32IFD-NEXT:    addi a3, a0, -1311
-; RV32IFD-NEXT:    lui a0, 262574
-; RV32IFD-NEXT:    addi a5, a0, 327
+; RV32IFD-NEXT:    lui a1, 262510
+; RV32IFD-NEXT:    lui a3, 713032
+; RV32IFD-NEXT:    lui a5, 262574
 ; RV32IFD-NEXT:    li a0, 1
 ; RV32IFD-NEXT:    li a2, 2
 ; RV32IFD-NEXT:    li a4, 3
 ; RV32IFD-NEXT:    li a6, 4
+; RV32IFD-NEXT:    addi a1, a1, 327
+; RV32IFD-NEXT:    addi a3, a3, -1311
+; RV32IFD-NEXT:    addi a5, a5, 327
 ; RV32IFD-NEXT:    sw a3, 0(sp)
 ; RV32IFD-NEXT:    sw a1, 4(sp)
 ; RV32IFD-NEXT:    sw a3, 8(sp)
@@ -207,16 +207,16 @@ define double @caller_double_stack() nounwind {
 ; RV32IZFINXZDINX:       # %bb.0:
 ; RV32IZFINXZDINX-NEXT:    addi sp, sp, -32
 ; RV32IZFINXZDINX-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    lui a0, 262510
-; RV32IZFINXZDINX-NEXT:    addi a1, a0, 327
-; RV32IZFINXZDINX-NEXT:    lui a0, 713032
-; RV32IZFINXZDINX-NEXT:    addi a3, a0, -1311
-; RV32IZFINXZDINX-NEXT:    lui a0, 262574
-; RV32IZFINXZDINX-NEXT:    addi a5, a0, 327
+; RV32IZFINXZDINX-NEXT:    lui a1, 262510
+; RV32IZFINXZDINX-NEXT:    lui a3, 713032
+; RV32IZFINXZDINX-NEXT:    lui a5, 262574
 ; RV32IZFINXZDINX-NEXT:    li a0, 1
 ; RV32IZFINXZDINX-NEXT:    li a2, 2
 ; RV32IZFINXZDINX-NEXT:    li a4, 3
 ; RV32IZFINXZDINX-NEXT:    li a6, 4
+; RV32IZFINXZDINX-NEXT:    addi a1, a1, 327
+; RV32IZFINXZDINX-NEXT:    addi a3, a3, -1311
+; RV32IZFINXZDINX-NEXT:    addi a5, a5, 327
 ; RV32IZFINXZDINX-NEXT:    sw a3, 0(sp)
 ; RV32IZFINXZDINX-NEXT:    sw a1, 4(sp)
 ; RV32IZFINXZDINX-NEXT:    sw a3, 8(sp)

diff  --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index a8b141618bbb3a..c39085a80ddc10 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -692,7 +692,7 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB12_2
 ; RV32IFD-NEXT:  # %bb.1: # %start
@@ -700,19 +700,19 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IFD-NEXT:  .LBB12_2: # %start
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI12_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI12_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB12_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB12_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB12_4: # %start
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -740,29 +740,29 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI12_0)
 ; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI12_0+4)(a2)
 ; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI12_0)(a2)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
-; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
-; RV32IZFINXZDINX-NEXT:    beqz a2, .LBB12_2
+; RV32IZFINXZDINX-NEXT:    fle.d a3, a2, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
+; RV32IZFINXZDINX-NEXT:    lui a2, 524288
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB12_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1: # %start
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB12_2: # %start
 ; RV32IZFINXZDINX-NEXT:    lui a1, %hi(.LCPI12_1)
 ; RV32IZFINXZDINX-NEXT:    lw a6, %lo(.LCPI12_1)(a1)
 ; RV32IZFINXZDINX-NEXT:    lw a7, %lo(.LCPI12_1+4)(a1)
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a6, s0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB12_4
+; RV32IZFINXZDINX-NEXT:    flt.d a1, a6, s0
+; RV32IZFINXZDINX-NEXT:    beqz a1, .LBB12_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a2, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB12_4: # %start
-; RV32IZFINXZDINX-NEXT:    feq.d a1, s0, s0
+; RV32IZFINXZDINX-NEXT:    feq.d a4, s0, s0
+; RV32IZFINXZDINX-NEXT:    neg a3, a3
 ; RV32IZFINXZDINX-NEXT:    neg a5, a1
-; RV32IZFINXZDINX-NEXT:    and a1, a5, a3
-; RV32IZFINXZDINX-NEXT:    neg a2, a2
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a4, a4
+; RV32IZFINXZDINX-NEXT:    and a0, a3, a0
+; RV32IZFINXZDINX-NEXT:    and a1, a4, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    and a0, a4, a0
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -800,14 +800,14 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    li a2, 0
 ; RV32I-NEXT:    call __gedf2
-; RV32I-NEXT:    mv s3, a0
+; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    call __fixdfdi
-; RV32I-NEXT:    mv s4, a0
+; RV32I-NEXT:    mv s3, a0
 ; RV32I-NEXT:    mv s5, a1
 ; RV32I-NEXT:    lui a0, 524288
-; RV32I-NEXT:    bgez s3, .LBB12_2
+; RV32I-NEXT:    bgez s4, .LBB12_2
 ; RV32I-NEXT:  # %bb.1: # %start
 ; RV32I-NEXT:    lui s5, 524288
 ; RV32I-NEXT:  .LBB12_2: # %start
@@ -821,14 +821,14 @@ define i64 @fcvt_l_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    call __unorddf2
 ; RV32I-NEXT:    snez a0, a0
+; RV32I-NEXT:    slti a1, s4, 0
+; RV32I-NEXT:    sgtz a2, s2
 ; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    addi a3, a1, -1
 ; RV32I-NEXT:    and a1, a0, s5
-; RV32I-NEXT:    slti a2, s3, 0
-; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a2, a2, s4
-; RV32I-NEXT:    sgtz a3, s2
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    or a2, a3, a2
+; RV32I-NEXT:    and a3, a3, s3
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    or a2, a2, a3
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -954,10 +954,10 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV32IFD-NEXT:    lui a2, %hi(.LCPI14_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI14_0)(a2)
 ; RV32IFD-NEXT:    and a0, s0, a0
+; RV32IFD-NEXT:    and a1, s0, a1
 ; RV32IFD-NEXT:    flt.d a2, fa5, fs0
 ; RV32IFD-NEXT:    neg a2, a2
 ; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a1, s0, a1
 ; RV32IFD-NEXT:    or a1, a2, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -985,16 +985,16 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI14_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI14_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI14_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1037,8 +1037,8 @@ define i64 @fcvt_lu_d_sat(double %a) nounwind {
 ; RV32I-NEXT:    mv a1, s0
 ; RV32I-NEXT:    call __fixunsdfdi
 ; RV32I-NEXT:    and a0, s3, a0
-; RV32I-NEXT:    or a0, s2, a0
 ; RV32I-NEXT:    and a1, s3, a1
+; RV32I-NEXT:    or a0, s2, a0
 ; RV32I-NEXT:    or a1, s2, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -1629,8 +1629,8 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind {
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI26_1)
 ; RV32IFD-NEXT:    fld fa4, %lo(.LCPI26_1)(a0)
 ; RV32IFD-NEXT:    feq.d a0, fa0, fa0
-; RV32IFD-NEXT:    neg a0, a0
 ; RV32IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV32IFD-NEXT:    neg a0, a0
 ; RV32IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV32IFD-NEXT:    fcvt.w.d a1, fa5, rtz
 ; RV32IFD-NEXT:    and a0, a0, a1
@@ -1643,8 +1643,8 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind {
 ; RV64IFD-NEXT:    lui a0, %hi(.LCPI26_1)
 ; RV64IFD-NEXT:    fld fa4, %lo(.LCPI26_1)(a0)
 ; RV64IFD-NEXT:    feq.d a0, fa0, fa0
-; RV64IFD-NEXT:    neg a0, a0
 ; RV64IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV64IFD-NEXT:    neg a0, a0
 ; RV64IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV64IFD-NEXT:    fcvt.l.d a1, fa5, rtz
 ; RV64IFD-NEXT:    and a0, a0, a1
@@ -1658,26 +1658,26 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI26_1)
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI26_1+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI26_1)(a4)
-; RV32IZFINXZDINX-NEXT:    feq.d a6, a0, a0
-; RV32IZFINXZDINX-NEXT:    neg a6, a6
-; RV32IZFINXZDINX-NEXT:    fmax.d a0, a0, a2
-; RV32IZFINXZDINX-NEXT:    fmin.d a0, a0, a4
-; RV32IZFINXZDINX-NEXT:    fcvt.w.d a0, a0, rtz
-; RV32IZFINXZDINX-NEXT:    and a0, a6, a0
+; RV32IZFINXZDINX-NEXT:    fmax.d a2, a0, a2
+; RV32IZFINXZDINX-NEXT:    feq.d a0, a0, a0
+; RV32IZFINXZDINX-NEXT:    neg a0, a0
+; RV32IZFINXZDINX-NEXT:    fmin.d a2, a2, a4
+; RV32IZFINXZDINX-NEXT:    fcvt.w.d a1, a2, rtz
+; RV32IZFINXZDINX-NEXT:    and a0, a0, a1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZFINXZDINX:       # %bb.0: # %start
-; RV64IZFINXZDINX-NEXT:    feq.d a1, a0, a0
-; RV64IZFINXZDINX-NEXT:    neg a1, a1
+; RV64IZFINXZDINX-NEXT:    li a1, -505
 ; RV64IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI26_0)
+; RV64IZFINXZDINX-NEXT:    slli a1, a1, 53
 ; RV64IZFINXZDINX-NEXT:    ld a2, %lo(.LCPI26_0)(a2)
-; RV64IZFINXZDINX-NEXT:    li a3, -505
-; RV64IZFINXZDINX-NEXT:    slli a3, a3, 53
-; RV64IZFINXZDINX-NEXT:    fmax.d a0, a0, a3
-; RV64IZFINXZDINX-NEXT:    fmin.d a0, a0, a2
-; RV64IZFINXZDINX-NEXT:    fcvt.l.d a0, a0, rtz
-; RV64IZFINXZDINX-NEXT:    and a0, a1, a0
+; RV64IZFINXZDINX-NEXT:    fmax.d a1, a0, a1
+; RV64IZFINXZDINX-NEXT:    feq.d a0, a0, a0
+; RV64IZFINXZDINX-NEXT:    neg a0, a0
+; RV64IZFINXZDINX-NEXT:    fmin.d a1, a1, a2
+; RV64IZFINXZDINX-NEXT:    fcvt.l.d a1, a1, rtz
+; RV64IZFINXZDINX-NEXT:    and a0, a0, a1
 ; RV64IZFINXZDINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcvt_w_s_sat_i16:
@@ -2004,8 +2004,8 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind {
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI30_1)
 ; RV32IFD-NEXT:    fld fa4, %lo(.LCPI30_1)(a0)
 ; RV32IFD-NEXT:    feq.d a0, fa0, fa0
-; RV32IFD-NEXT:    neg a0, a0
 ; RV32IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV32IFD-NEXT:    neg a0, a0
 ; RV32IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV32IFD-NEXT:    fcvt.w.d a1, fa5, rtz
 ; RV32IFD-NEXT:    and a0, a0, a1
@@ -2018,8 +2018,8 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind {
 ; RV64IFD-NEXT:    lui a0, %hi(.LCPI30_1)
 ; RV64IFD-NEXT:    fld fa4, %lo(.LCPI30_1)(a0)
 ; RV64IFD-NEXT:    feq.d a0, fa0, fa0
-; RV64IFD-NEXT:    neg a0, a0
 ; RV64IFD-NEXT:    fmax.d fa5, fa0, fa5
+; RV64IFD-NEXT:    neg a0, a0
 ; RV64IFD-NEXT:    fmin.d fa5, fa5, fa4
 ; RV64IFD-NEXT:    fcvt.l.d a1, fa5, rtz
 ; RV64IFD-NEXT:    and a0, a0, a1
@@ -2033,22 +2033,22 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI30_1)
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI30_1+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI30_1)(a4)
-; RV32IZFINXZDINX-NEXT:    feq.d a6, a0, a0
-; RV32IZFINXZDINX-NEXT:    neg a6, a6
-; RV32IZFINXZDINX-NEXT:    fmax.d a0, a0, a2
-; RV32IZFINXZDINX-NEXT:    fmin.d a0, a0, a4
-; RV32IZFINXZDINX-NEXT:    fcvt.w.d a0, a0, rtz
-; RV32IZFINXZDINX-NEXT:    and a0, a6, a0
+; RV32IZFINXZDINX-NEXT:    fmax.d a2, a0, a2
+; RV32IZFINXZDINX-NEXT:    feq.d a0, a0, a0
+; RV32IZFINXZDINX-NEXT:    neg a0, a0
+; RV32IZFINXZDINX-NEXT:    fmin.d a2, a2, a4
+; RV32IZFINXZDINX-NEXT:    fcvt.w.d a1, a2, rtz
+; RV32IZFINXZDINX-NEXT:    and a0, a0, a1
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZFINXZDINX:       # %bb.0: # %start
 ; RV64IZFINXZDINX-NEXT:    feq.d a1, a0, a0
-; RV64IZFINXZDINX-NEXT:    neg a1, a1
 ; RV64IZFINXZDINX-NEXT:    li a2, -509
 ; RV64IZFINXZDINX-NEXT:    slli a2, a2, 53
 ; RV64IZFINXZDINX-NEXT:    fmax.d a0, a0, a2
 ; RV64IZFINXZDINX-NEXT:    lui a2, 65919
+; RV64IZFINXZDINX-NEXT:    neg a1, a1
 ; RV64IZFINXZDINX-NEXT:    slli a2, a2, 34
 ; RV64IZFINXZDINX-NEXT:    fmin.d a0, a0, a2
 ; RV64IZFINXZDINX-NEXT:    fcvt.l.d a0, a0, rtz

diff  --git a/llvm/test/CodeGen/RISCV/double-imm.ll b/llvm/test/CodeGen/RISCV/double-imm.ll
index 2294171d95ab2c..155827ad069cc6 100644
--- a/llvm/test/CodeGen/RISCV/double-imm.ll
+++ b/llvm/test/CodeGen/RISCV/double-imm.ll
@@ -24,8 +24,8 @@ define double @double_imm() nounwind {
 ; CHECKRV32ZDINX-LABEL: double_imm:
 ; CHECKRV32ZDINX:       # %bb.0:
 ; CHECKRV32ZDINX-NEXT:    lui a0, 345155
-; CHECKRV32ZDINX-NEXT:    addi a0, a0, -744
 ; CHECKRV32ZDINX-NEXT:    lui a1, 262290
+; CHECKRV32ZDINX-NEXT:    addi a0, a0, -744
 ; CHECKRV32ZDINX-NEXT:    addi a1, a1, 507
 ; CHECKRV32ZDINX-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/double-intrinsics.ll b/llvm/test/CodeGen/RISCV/double-intrinsics.ll
index a65fd09613424c..3ef128ed6d4cdb 100644
--- a/llvm/test/CodeGen/RISCV/double-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/double-intrinsics.ll
@@ -817,8 +817,8 @@ define double @copysign_f64(double %a, double %b) nounwind {
 ; RV32I-LABEL: copysign_f64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    ret
@@ -826,8 +826,8 @@ define double @copysign_f64(double %a, double %b) nounwind {
 ; RV64I-LABEL: copysign_f64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a1, 63
-; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    slli a1, a1, 63
 ; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -1535,8 +1535,8 @@ define i1 @isnan_d_fpclass(double %x) {
 ; RV64I-LABEL: isnan_d_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 1
-; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    li a1, 2047
+; RV64I-NEXT:    srli a0, a0, 1
 ; RV64I-NEXT:    slli a1, a1, 52
 ; RV64I-NEXT:    slt a0, a1, a0
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/double-mem.ll b/llvm/test/CodeGen/RISCV/double-mem.ll
index 38cb52b6f4b302..dba9489e7511db 100644
--- a/llvm/test/CodeGen/RISCV/double-mem.ll
+++ b/llvm/test/CodeGen/RISCV/double-mem.ll
@@ -93,17 +93,17 @@ define dso_local double @fld_fsd_global(double %a, double %b) nounwind {
 ;
 ; RV32IZFINXZDINX-LABEL: fld_fsd_global:
 ; RV32IZFINXZDINX:       # %bb.0:
+; RV32IZFINXZDINX-NEXT:    lui a4, %hi(G)
 ; RV32IZFINXZDINX-NEXT:    fadd.d a0, a0, a2
-; RV32IZFINXZDINX-NEXT:    lui a2, %hi(G)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(G)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(G+4)(a2)
-; RV32IZFINXZDINX-NEXT:    addi a3, a2, %lo(G)
-; RV32IZFINXZDINX-NEXT:    sw a0, %lo(G)(a2)
-; RV32IZFINXZDINX-NEXT:    sw a1, %lo(G+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a4, 72(a3)
-; RV32IZFINXZDINX-NEXT:    lw a5, 76(a3)
-; RV32IZFINXZDINX-NEXT:    sw a0, 72(a3)
-; RV32IZFINXZDINX-NEXT:    sw a1, 76(a3)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(G)(a4)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(G+4)(a4)
+; RV32IZFINXZDINX-NEXT:    addi a2, a4, %lo(G)
+; RV32IZFINXZDINX-NEXT:    sw a0, %lo(G)(a4)
+; RV32IZFINXZDINX-NEXT:    sw a1, %lo(G+4)(a4)
+; RV32IZFINXZDINX-NEXT:    lw a4, 72(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, 76(a2)
+; RV32IZFINXZDINX-NEXT:    sw a0, 72(a2)
+; RV32IZFINXZDINX-NEXT:    sw a1, 76(a2)
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: fld_fsd_global:

diff  --git a/llvm/test/CodeGen/RISCV/double-previous-failure.ll b/llvm/test/CodeGen/RISCV/double-previous-failure.ll
index c169b1099b273a..c5a7ee79364c65 100644
--- a/llvm/test/CodeGen/RISCV/double-previous-failure.ll
+++ b/llvm/test/CodeGen/RISCV/double-previous-failure.ll
@@ -28,8 +28,8 @@ define i32 @main() nounwind {
 ; RV32IFD-NEXT:    call test
 ; RV32IFD-NEXT:    sw a0, 0(sp)
 ; RV32IFD-NEXT:    sw a1, 4(sp)
-; RV32IFD-NEXT:    fld fa5, 0(sp)
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV32IFD-NEXT:    fld fa5, 0(sp)
 ; RV32IFD-NEXT:    fld fa4, %lo(.LCPI1_0)(a0)
 ; RV32IFD-NEXT:    flt.d a0, fa5, fa4
 ; RV32IFD-NEXT:    bnez a0, .LBB1_3

diff  --git a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
index 0839f61b2d7936..cd87f2d2301d7c 100644
--- a/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/double-round-conv-sat.ll
@@ -53,7 +53,7 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB1_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -61,19 +61,19 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB1_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI1_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI1_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB1_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB1_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB1_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -100,31 +100,31 @@ define i64 @test_floor_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI1_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI1_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI1_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI1_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI1_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI1_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI1_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI1_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI1_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI1_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI1_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB1_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB1_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB1_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB1_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB1_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -186,15 +186,15 @@ define i64 @test_floor_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call floor
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI3_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI3_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -223,16 +223,16 @@ define i64 @test_floor_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI3_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI3_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI3_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -297,7 +297,7 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB5_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -305,19 +305,19 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB5_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI5_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI5_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB5_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB5_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB5_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -344,31 +344,31 @@ define i64 @test_ceil_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI5_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI5_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI5_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI5_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI5_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI5_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI5_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI5_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI5_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI5_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI5_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB5_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB5_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB5_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB5_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB5_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -430,15 +430,15 @@ define i64 @test_ceil_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call ceil
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI7_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI7_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -467,16 +467,16 @@ define i64 @test_ceil_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI7_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI7_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI7_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -541,7 +541,7 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB9_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -549,19 +549,19 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB9_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI9_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI9_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB9_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB9_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB9_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -588,31 +588,31 @@ define i64 @test_trunc_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI9_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI9_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI9_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI9_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI9_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI9_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI9_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI9_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI9_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI9_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI9_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB9_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB9_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB9_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB9_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB9_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -674,15 +674,15 @@ define i64 @test_trunc_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call trunc
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI11_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI11_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -711,16 +711,16 @@ define i64 @test_trunc_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI11_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI11_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI11_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -785,7 +785,7 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB13_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -793,19 +793,19 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB13_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI13_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI13_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB13_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB13_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB13_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -832,31 +832,31 @@ define i64 @test_round_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI13_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI13_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI13_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI13_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI13_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI13_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI13_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI13_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI13_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI13_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI13_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB13_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB13_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB13_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB13_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB13_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -918,15 +918,15 @@ define i64 @test_round_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call round
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI15_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI15_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -955,16 +955,16 @@ define i64 @test_round_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI15_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI15_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI15_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1029,7 +1029,7 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB17_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -1037,19 +1037,19 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB17_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI17_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI17_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB17_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB17_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB17_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1076,31 +1076,31 @@ define i64 @test_roundeven_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI17_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI17_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI17_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI17_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI17_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI17_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI17_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI17_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI17_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI17_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI17_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB17_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB17_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB17_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB17_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB17_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1162,15 +1162,15 @@ define i64 @test_roundeven_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call roundeven
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI19_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI19_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1199,16 +1199,16 @@ define i64 @test_roundeven_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI19_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI19_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI19_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1273,7 +1273,7 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IFD-NEXT:    fmv.d fs0, fa0
 ; RV32IFD-NEXT:    fle.d s0, fa5, fa0
 ; RV32IFD-NEXT:    call __fixdfdi
-; RV32IFD-NEXT:    lui a4, 524288
+; RV32IFD-NEXT:    lui a3, 524288
 ; RV32IFD-NEXT:    lui a2, 524288
 ; RV32IFD-NEXT:    beqz s0, .LBB21_2
 ; RV32IFD-NEXT:  # %bb.1:
@@ -1281,19 +1281,19 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IFD-NEXT:  .LBB21_2:
 ; RV32IFD-NEXT:    lui a1, %hi(.LCPI21_1)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI21_1)(a1)
-; RV32IFD-NEXT:    flt.d a3, fa5, fs0
-; RV32IFD-NEXT:    beqz a3, .LBB21_4
+; RV32IFD-NEXT:    flt.d a1, fa5, fs0
+; RV32IFD-NEXT:    beqz a1, .LBB21_4
 ; RV32IFD-NEXT:  # %bb.3:
-; RV32IFD-NEXT:    addi a2, a4, -1
+; RV32IFD-NEXT:    addi a2, a3, -1
 ; RV32IFD-NEXT:  .LBB21_4:
-; RV32IFD-NEXT:    feq.d a1, fs0, fs0
+; RV32IFD-NEXT:    feq.d a3, fs0, fs0
 ; RV32IFD-NEXT:    neg a4, a1
-; RV32IFD-NEXT:    and a1, a4, a2
-; RV32IFD-NEXT:    neg a2, a3
-; RV32IFD-NEXT:    neg a3, s0
+; RV32IFD-NEXT:    neg a1, s0
+; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a0, a1, a0
+; RV32IFD-NEXT:    and a1, a3, a2
+; RV32IFD-NEXT:    or a0, a4, a0
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    or a0, a2, a0
-; RV32IFD-NEXT:    and a0, a4, a0
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -1320,31 +1320,31 @@ define i64 @test_rint_si64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    mv s1, a1
 ; RV32IZFINXZDINX-NEXT:    call __fixdfdi
 ; RV32IZFINXZDINX-NEXT:    lui a2, %hi(.LCPI21_0)
-; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI21_0+4)(a2)
-; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI21_0)(a2)
-; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI21_1)
-; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI21_1+4)(a4)
-; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI21_1)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a6, a2, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a6
-; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a4, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a2, a4
-; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    lui a3, %hi(.LCPI21_1)
+; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI21_0)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI21_0+4)(a2)
+; RV32IZFINXZDINX-NEXT:    lw a2, %lo(.LCPI21_1)(a3)
+; RV32IZFINXZDINX-NEXT:    lw a3, %lo(.LCPI21_1+4)(a3)
+; RV32IZFINXZDINX-NEXT:    fle.d a6, a4, s0
+; RV32IZFINXZDINX-NEXT:    flt.d a3, a2, s0
 ; RV32IZFINXZDINX-NEXT:    feq.d a2, s0, s0
+; RV32IZFINXZDINX-NEXT:    lui a4, 524288
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    neg a5, a6
+; RV32IZFINXZDINX-NEXT:    and a0, a5, a0
+; RV32IZFINXZDINX-NEXT:    neg a5, a3
+; RV32IZFINXZDINX-NEXT:    or a0, a5, a0
 ; RV32IZFINXZDINX-NEXT:    lui a5, 524288
-; RV32IZFINXZDINX-NEXT:    lui a3, 524288
 ; RV32IZFINXZDINX-NEXT:    beqz a6, .LBB21_2
 ; RV32IZFINXZDINX-NEXT:  # %bb.1:
-; RV32IZFINXZDINX-NEXT:    mv a3, a1
+; RV32IZFINXZDINX-NEXT:    mv a5, a1
 ; RV32IZFINXZDINX-NEXT:  .LBB21_2:
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    beqz a4, .LBB21_4
+; RV32IZFINXZDINX-NEXT:    beqz a3, .LBB21_4
 ; RV32IZFINXZDINX-NEXT:  # %bb.3:
-; RV32IZFINXZDINX-NEXT:    addi a3, a5, -1
+; RV32IZFINXZDINX-NEXT:    addi a5, a4, -1
 ; RV32IZFINXZDINX-NEXT:  .LBB21_4:
-; RV32IZFINXZDINX-NEXT:    and a1, a2, a3
+; RV32IZFINXZDINX-NEXT:    and a1, a2, a5
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1406,15 +1406,15 @@ define i64 @test_rint_ui64(double %x) nounwind {
 ; RV32IFD-NEXT:    call rint
 ; RV32IFD-NEXT:    lui a0, %hi(.LCPI23_0)
 ; RV32IFD-NEXT:    fld fa5, %lo(.LCPI23_0)(a0)
-; RV32IFD-NEXT:    flt.d a0, fa5, fa0
-; RV32IFD-NEXT:    neg s0, a0
-; RV32IFD-NEXT:    fcvt.d.w fa5, zero
-; RV32IFD-NEXT:    fle.d a0, fa5, fa0
+; RV32IFD-NEXT:    fcvt.d.w fa4, zero
+; RV32IFD-NEXT:    fle.d a0, fa4, fa0
+; RV32IFD-NEXT:    flt.d a1, fa5, fa0
+; RV32IFD-NEXT:    neg s0, a1
 ; RV32IFD-NEXT:    neg s1, a0
 ; RV32IFD-NEXT:    call __fixunsdfdi
 ; RV32IFD-NEXT:    and a0, s1, a0
-; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    and a1, s1, a1
+; RV32IFD-NEXT:    or a0, s0, a0
 ; RV32IFD-NEXT:    or a1, s0, a1
 ; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1443,16 +1443,16 @@ define i64 @test_rint_ui64(double %x) nounwind {
 ; RV32IZFINXZDINX-NEXT:    call __fixunsdfdi
 ; RV32IZFINXZDINX-NEXT:    fcvt.d.w a2, zero
 ; RV32IZFINXZDINX-NEXT:    lui a4, %hi(.LCPI23_0)
+; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    lw a5, %lo(.LCPI23_0+4)(a4)
 ; RV32IZFINXZDINX-NEXT:    lw a4, %lo(.LCPI23_0)(a4)
-; RV32IZFINXZDINX-NEXT:    fle.d a2, a2, s0
 ; RV32IZFINXZDINX-NEXT:    neg a2, a2
 ; RV32IZFINXZDINX-NEXT:    and a0, a2, a0
-; RV32IZFINXZDINX-NEXT:    flt.d a3, a4, s0
-; RV32IZFINXZDINX-NEXT:    neg a3, a3
-; RV32IZFINXZDINX-NEXT:    or a0, a3, a0
 ; RV32IZFINXZDINX-NEXT:    and a1, a2, a1
-; RV32IZFINXZDINX-NEXT:    or a1, a3, a1
+; RV32IZFINXZDINX-NEXT:    flt.d a2, a4, s0
+; RV32IZFINXZDINX-NEXT:    neg a2, a2
+; RV32IZFINXZDINX-NEXT:    or a0, a2, a0
+; RV32IZFINXZDINX-NEXT:    or a1, a2, a1
 ; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINXZDINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll
index 654a4609caa23f..e7ff991413013b 100644
--- a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll
@@ -577,9 +577,9 @@ define i32 @select_fcmp_oeq_1_2(double %a, double %b) {
 ;
 ; CHECKRV32ZDINX-LABEL: select_fcmp_oeq_1_2:
 ; CHECKRV32ZDINX:       # %bb.0:
+; CHECKRV32ZDINX-NEXT:    li a4, 2
 ; CHECKRV32ZDINX-NEXT:    feq.d a0, a0, a2
-; CHECKRV32ZDINX-NEXT:    li a1, 2
-; CHECKRV32ZDINX-NEXT:    sub a0, a1, a0
+; CHECKRV32ZDINX-NEXT:    sub a0, a4, a0
 ; CHECKRV32ZDINX-NEXT:    ret
 ;
 ; CHECKRV64ZDINX-LABEL: select_fcmp_oeq_1_2:

diff  --git a/llvm/test/CodeGen/RISCV/double_reduct.ll b/llvm/test/CodeGen/RISCV/double_reduct.ll
index 25228b21ef0554..cecdd77a079e42 100644
--- a/llvm/test/CodeGen/RISCV/double_reduct.ll
+++ b/llvm/test/CodeGen/RISCV/double_reduct.ll
@@ -25,14 +25,14 @@ define float @fmul_f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 2
 ; CHECK-NEXT:    vfmul.vv v8, v8, v10
+; CHECK-NEXT:    vslidedown.vi v10, v9, 2
+; CHECK-NEXT:    vfmul.vv v9, v9, v10
 ; CHECK-NEXT:    vrgather.vi v10, v8, 1
 ; CHECK-NEXT:    vfmul.vv v8, v8, v10
+; CHECK-NEXT:    vrgather.vi v10, v9, 1
+; CHECK-NEXT:    vfmul.vv v9, v9, v10
 ; CHECK-NEXT:    vfmv.f.s fa5, v8
-; CHECK-NEXT:    vslidedown.vi v8, v9, 2
-; CHECK-NEXT:    vfmul.vv v8, v9, v8
-; CHECK-NEXT:    vrgather.vi v9, v8, 1
-; CHECK-NEXT:    vfmul.vv v8, v8, v9
-; CHECK-NEXT:    vfmv.f.s fa4, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
 ; CHECK-NEXT:    fmul.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -130,14 +130,14 @@ define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    vslidedown.vi v10, v9, 2
+; RV32-NEXT:    vmul.vv v9, v9, v10
 ; RV32-NEXT:    vrgather.vi v10, v8, 1
 ; RV32-NEXT:    vmul.vv v8, v8, v10
+; RV32-NEXT:    vrgather.vi v10, v9, 1
+; RV32-NEXT:    vmul.vv v9, v9, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    vslidedown.vi v8, v9, 2
-; RV32-NEXT:    vmul.vv v8, v9, v8
-; RV32-NEXT:    vrgather.vi v9, v8, 1
-; RV32-NEXT:    vmul.vv v8, v8, v9
-; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    vmv.x.s a1, v9
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    ret
 ;
@@ -146,14 +146,14 @@ define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) {
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 2
 ; RV64-NEXT:    vmul.vv v8, v8, v10
+; RV64-NEXT:    vslidedown.vi v10, v9, 2
+; RV64-NEXT:    vmul.vv v9, v9, v10
 ; RV64-NEXT:    vrgather.vi v10, v8, 1
 ; RV64-NEXT:    vmul.vv v8, v8, v10
+; RV64-NEXT:    vrgather.vi v10, v9, 1
+; RV64-NEXT:    vmul.vv v9, v9, v10
 ; RV64-NEXT:    vmv.x.s a0, v8
-; RV64-NEXT:    vslidedown.vi v8, v9, 2
-; RV64-NEXT:    vmul.vv v8, v9, v8
-; RV64-NEXT:    vrgather.vi v9, v8, 1
-; RV64-NEXT:    vmul.vv v8, v8, v9
-; RV64-NEXT:    vmv.x.s a1, v8
+; RV64-NEXT:    vmv.x.s a1, v9
 ; RV64-NEXT:    mulw a0, a0, a1
 ; RV64-NEXT:    ret
   %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a)

diff  --git a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
index 0a3b5d47e5650a..26e86d41176e04 100644
--- a/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
+++ b/llvm/test/CodeGen/RISCV/early-clobber-tied-def-subreg-liveness.ll
@@ -49,10 +49,10 @@ define void @_Z3foov() {
 ; CHECK-NEXT:    vs2r.v v12, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs2r.v v14, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    #APP
-; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    lui a0, %hi(.L__const._Z3foov.var_40)
 ; CHECK-NEXT:    addi a0, a0, %lo(.L__const._Z3foov.var_40)
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    lui a0, 1048572

diff  --git a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
index 7523119c4ff778..8a91c46bcdaff0 100644
--- a/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
+++ b/llvm/test/CodeGen/RISCV/fastcc-without-f-reg.ll
@@ -280,17 +280,20 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX32-NEXT:    sw s9, 68(sp) # 4-byte Folded Spill
 ; ZHINX32-NEXT:    sw s10, 64(sp) # 4-byte Folded Spill
 ; ZHINX32-NEXT:    sw s11, 60(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lh t0, 112(sp)
-; ZHINX32-NEXT:    sh t0, 58(sp) # 2-byte Folded Spill
-; ZHINX32-NEXT:    lh t0, 116(sp)
-; ZHINX32-NEXT:    sh t0, 56(sp) # 2-byte Folded Spill
-; ZHINX32-NEXT:    lh t0, 120(sp)
-; ZHINX32-NEXT:    sh t0, 54(sp) # 2-byte Folded Spill
-; ZHINX32-NEXT:    lh t0, 124(sp)
-; ZHINX32-NEXT:    sh t0, 52(sp) # 2-byte Folded Spill
-; ZHINX32-NEXT:    lh t6, 128(sp)
-; ZHINX32-NEXT:    lh t4, 132(sp)
-; ZHINX32-NEXT:    lh t5, 136(sp)
+; ZHINX32-NEXT:    sh a7, 58(sp) # 2-byte Folded Spill
+; ZHINX32-NEXT:    sh a6, 56(sp) # 2-byte Folded Spill
+; ZHINX32-NEXT:    sh a5, 54(sp) # 2-byte Folded Spill
+; ZHINX32-NEXT:    sh a4, 52(sp) # 2-byte Folded Spill
+; ZHINX32-NEXT:    mv a7, a3
+; ZHINX32-NEXT:    mv a6, a2
+; ZHINX32-NEXT:    mv a5, a1
+; ZHINX32-NEXT:    lh t3, 112(sp)
+; ZHINX32-NEXT:    lh t4, 116(sp)
+; ZHINX32-NEXT:    lh t5, 120(sp)
+; ZHINX32-NEXT:    lh t6, 124(sp)
+; ZHINX32-NEXT:    lh t0, 128(sp)
+; ZHINX32-NEXT:    lh t1, 132(sp)
+; ZHINX32-NEXT:    lh t2, 136(sp)
 ; ZHINX32-NEXT:    lh s0, 140(sp)
 ; ZHINX32-NEXT:    lh s1, 144(sp)
 ; ZHINX32-NEXT:    lh s2, 148(sp)
@@ -304,14 +307,14 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX32-NEXT:    lh s10, 180(sp)
 ; ZHINX32-NEXT:    lh s11, 184(sp)
 ; ZHINX32-NEXT:    lh ra, 188(sp)
-; ZHINX32-NEXT:    lh t0, 192(sp)
-; ZHINX32-NEXT:    lh t1, 196(sp)
-; ZHINX32-NEXT:    lh t2, 200(sp)
-; ZHINX32-NEXT:    lh t3, 204(sp)
-; ZHINX32-NEXT:    sh t0, 32(sp)
-; ZHINX32-NEXT:    sh t1, 34(sp)
-; ZHINX32-NEXT:    sh t2, 36(sp)
-; ZHINX32-NEXT:    sh t3, 38(sp)
+; ZHINX32-NEXT:    lh a1, 192(sp)
+; ZHINX32-NEXT:    lh a2, 196(sp)
+; ZHINX32-NEXT:    lh a3, 200(sp)
+; ZHINX32-NEXT:    lh a4, 204(sp)
+; ZHINX32-NEXT:    sh a1, 32(sp)
+; ZHINX32-NEXT:    sh a2, 34(sp)
+; ZHINX32-NEXT:    sh a3, 36(sp)
+; ZHINX32-NEXT:    sh a4, 38(sp)
 ; ZHINX32-NEXT:    sh s9, 24(sp)
 ; ZHINX32-NEXT:    sh s10, 26(sp)
 ; ZHINX32-NEXT:    sh s11, 28(sp)
@@ -324,14 +327,17 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX32-NEXT:    sh s2, 10(sp)
 ; ZHINX32-NEXT:    sh s3, 12(sp)
 ; ZHINX32-NEXT:    sh s4, 14(sp)
-; ZHINX32-NEXT:    sh t6, 0(sp)
-; ZHINX32-NEXT:    sh t4, 2(sp)
-; ZHINX32-NEXT:    sh t5, 4(sp)
+; ZHINX32-NEXT:    sh t0, 0(sp)
+; ZHINX32-NEXT:    sh t1, 2(sp)
+; ZHINX32-NEXT:    sh t2, 4(sp)
 ; ZHINX32-NEXT:    sh s0, 6(sp)
-; ZHINX32-NEXT:    lh t3, 58(sp) # 2-byte Folded Reload
-; ZHINX32-NEXT:    lh t4, 56(sp) # 2-byte Folded Reload
-; ZHINX32-NEXT:    lh t5, 54(sp) # 2-byte Folded Reload
-; ZHINX32-NEXT:    lh t6, 52(sp) # 2-byte Folded Reload
+; ZHINX32-NEXT:    mv a1, a5
+; ZHINX32-NEXT:    mv a2, a6
+; ZHINX32-NEXT:    mv a3, a7
+; ZHINX32-NEXT:    lh a4, 52(sp) # 2-byte Folded Reload
+; ZHINX32-NEXT:    lh a5, 54(sp) # 2-byte Folded Reload
+; ZHINX32-NEXT:    lh a6, 56(sp) # 2-byte Folded Reload
+; ZHINX32-NEXT:    lh a7, 58(sp) # 2-byte Folded Reload
 ; ZHINX32-NEXT:    call callee_half_32
 ; ZHINX32-NEXT:    lw ra, 108(sp) # 4-byte Folded Reload
 ; ZHINX32-NEXT:    lw s0, 104(sp) # 4-byte Folded Reload
@@ -365,17 +371,20 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX64-NEXT:    sd s9, 72(sp) # 8-byte Folded Spill
 ; ZHINX64-NEXT:    sd s10, 64(sp) # 8-byte Folded Spill
 ; ZHINX64-NEXT:    sd s11, 56(sp) # 8-byte Folded Spill
-; ZHINX64-NEXT:    lh t0, 160(sp)
-; ZHINX64-NEXT:    sh t0, 54(sp) # 2-byte Folded Spill
-; ZHINX64-NEXT:    lh t0, 168(sp)
-; ZHINX64-NEXT:    sh t0, 52(sp) # 2-byte Folded Spill
-; ZHINX64-NEXT:    lh t0, 176(sp)
-; ZHINX64-NEXT:    sh t0, 50(sp) # 2-byte Folded Spill
-; ZHINX64-NEXT:    lh t0, 184(sp)
-; ZHINX64-NEXT:    sh t0, 48(sp) # 2-byte Folded Spill
-; ZHINX64-NEXT:    lh t6, 192(sp)
-; ZHINX64-NEXT:    lh t4, 200(sp)
-; ZHINX64-NEXT:    lh t5, 208(sp)
+; ZHINX64-NEXT:    sh a7, 54(sp) # 2-byte Folded Spill
+; ZHINX64-NEXT:    sh a6, 52(sp) # 2-byte Folded Spill
+; ZHINX64-NEXT:    sh a5, 50(sp) # 2-byte Folded Spill
+; ZHINX64-NEXT:    sh a4, 48(sp) # 2-byte Folded Spill
+; ZHINX64-NEXT:    mv a7, a3
+; ZHINX64-NEXT:    mv a6, a2
+; ZHINX64-NEXT:    mv a5, a1
+; ZHINX64-NEXT:    lh t3, 160(sp)
+; ZHINX64-NEXT:    lh t4, 168(sp)
+; ZHINX64-NEXT:    lh t5, 176(sp)
+; ZHINX64-NEXT:    lh t6, 184(sp)
+; ZHINX64-NEXT:    lh t0, 192(sp)
+; ZHINX64-NEXT:    lh t1, 200(sp)
+; ZHINX64-NEXT:    lh t2, 208(sp)
 ; ZHINX64-NEXT:    lh s0, 216(sp)
 ; ZHINX64-NEXT:    lh s1, 224(sp)
 ; ZHINX64-NEXT:    lh s2, 232(sp)
@@ -389,14 +398,14 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX64-NEXT:    lh s10, 296(sp)
 ; ZHINX64-NEXT:    lh s11, 304(sp)
 ; ZHINX64-NEXT:    lh ra, 312(sp)
-; ZHINX64-NEXT:    lh t0, 320(sp)
-; ZHINX64-NEXT:    lh t1, 328(sp)
-; ZHINX64-NEXT:    lh t2, 336(sp)
-; ZHINX64-NEXT:    lh t3, 344(sp)
-; ZHINX64-NEXT:    sh t0, 32(sp)
-; ZHINX64-NEXT:    sh t1, 34(sp)
-; ZHINX64-NEXT:    sh t2, 36(sp)
-; ZHINX64-NEXT:    sh t3, 38(sp)
+; ZHINX64-NEXT:    lh a1, 320(sp)
+; ZHINX64-NEXT:    lh a2, 328(sp)
+; ZHINX64-NEXT:    lh a3, 336(sp)
+; ZHINX64-NEXT:    lh a4, 344(sp)
+; ZHINX64-NEXT:    sh a1, 32(sp)
+; ZHINX64-NEXT:    sh a2, 34(sp)
+; ZHINX64-NEXT:    sh a3, 36(sp)
+; ZHINX64-NEXT:    sh a4, 38(sp)
 ; ZHINX64-NEXT:    sh s9, 24(sp)
 ; ZHINX64-NEXT:    sh s10, 26(sp)
 ; ZHINX64-NEXT:    sh s11, 28(sp)
@@ -409,14 +418,17 @@ define half @caller_half_32(<32 x half> %A) nounwind {
 ; ZHINX64-NEXT:    sh s2, 10(sp)
 ; ZHINX64-NEXT:    sh s3, 12(sp)
 ; ZHINX64-NEXT:    sh s4, 14(sp)
-; ZHINX64-NEXT:    sh t6, 0(sp)
-; ZHINX64-NEXT:    sh t4, 2(sp)
-; ZHINX64-NEXT:    sh t5, 4(sp)
+; ZHINX64-NEXT:    sh t0, 0(sp)
+; ZHINX64-NEXT:    sh t1, 2(sp)
+; ZHINX64-NEXT:    sh t2, 4(sp)
 ; ZHINX64-NEXT:    sh s0, 6(sp)
-; ZHINX64-NEXT:    lh t3, 54(sp) # 2-byte Folded Reload
-; ZHINX64-NEXT:    lh t4, 52(sp) # 2-byte Folded Reload
-; ZHINX64-NEXT:    lh t5, 50(sp) # 2-byte Folded Reload
-; ZHINX64-NEXT:    lh t6, 48(sp) # 2-byte Folded Reload
+; ZHINX64-NEXT:    mv a1, a5
+; ZHINX64-NEXT:    mv a2, a6
+; ZHINX64-NEXT:    mv a3, a7
+; ZHINX64-NEXT:    lh a4, 48(sp) # 2-byte Folded Reload
+; ZHINX64-NEXT:    lh a5, 50(sp) # 2-byte Folded Reload
+; ZHINX64-NEXT:    lh a6, 52(sp) # 2-byte Folded Reload
+; ZHINX64-NEXT:    lh a7, 54(sp) # 2-byte Folded Reload
 ; ZHINX64-NEXT:    call callee_half_32
 ; ZHINX64-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
 ; ZHINX64-NEXT:    ld s0, 144(sp) # 8-byte Folded Reload
@@ -874,17 +886,20 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX32-NEXT:    sw s9, 116(sp) # 4-byte Folded Spill
 ; ZHINX32-NEXT:    sw s10, 112(sp) # 4-byte Folded Spill
 ; ZHINX32-NEXT:    sw s11, 108(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t0, 160(sp)
-; ZHINX32-NEXT:    sw t0, 104(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t0, 164(sp)
-; ZHINX32-NEXT:    sw t0, 100(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t0, 168(sp)
-; ZHINX32-NEXT:    sw t0, 96(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t0, 172(sp)
-; ZHINX32-NEXT:    sw t0, 92(sp) # 4-byte Folded Spill
-; ZHINX32-NEXT:    lw t6, 176(sp)
-; ZHINX32-NEXT:    lw t4, 180(sp)
-; ZHINX32-NEXT:    lw t5, 184(sp)
+; ZHINX32-NEXT:    sw a7, 104(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw a6, 100(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw a5, 96(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    sw a4, 92(sp) # 4-byte Folded Spill
+; ZHINX32-NEXT:    mv a7, a3
+; ZHINX32-NEXT:    mv a6, a2
+; ZHINX32-NEXT:    mv a5, a1
+; ZHINX32-NEXT:    lw t3, 160(sp)
+; ZHINX32-NEXT:    lw t4, 164(sp)
+; ZHINX32-NEXT:    lw t5, 168(sp)
+; ZHINX32-NEXT:    lw t6, 172(sp)
+; ZHINX32-NEXT:    lw t0, 176(sp)
+; ZHINX32-NEXT:    lw t1, 180(sp)
+; ZHINX32-NEXT:    lw t2, 184(sp)
 ; ZHINX32-NEXT:    lw s0, 188(sp)
 ; ZHINX32-NEXT:    lw s1, 192(sp)
 ; ZHINX32-NEXT:    lw s2, 196(sp)
@@ -898,14 +913,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX32-NEXT:    lw s10, 228(sp)
 ; ZHINX32-NEXT:    lw s11, 232(sp)
 ; ZHINX32-NEXT:    lw ra, 236(sp)
-; ZHINX32-NEXT:    lw t0, 240(sp)
-; ZHINX32-NEXT:    lw t1, 244(sp)
-; ZHINX32-NEXT:    lw t2, 248(sp)
-; ZHINX32-NEXT:    lw t3, 252(sp)
-; ZHINX32-NEXT:    sw t0, 64(sp)
-; ZHINX32-NEXT:    sw t1, 68(sp)
-; ZHINX32-NEXT:    sw t2, 72(sp)
-; ZHINX32-NEXT:    sw t3, 76(sp)
+; ZHINX32-NEXT:    lw a1, 240(sp)
+; ZHINX32-NEXT:    lw a2, 244(sp)
+; ZHINX32-NEXT:    lw a3, 248(sp)
+; ZHINX32-NEXT:    lw a4, 252(sp)
+; ZHINX32-NEXT:    sw a1, 64(sp)
+; ZHINX32-NEXT:    sw a2, 68(sp)
+; ZHINX32-NEXT:    sw a3, 72(sp)
+; ZHINX32-NEXT:    sw a4, 76(sp)
 ; ZHINX32-NEXT:    sw s9, 48(sp)
 ; ZHINX32-NEXT:    sw s10, 52(sp)
 ; ZHINX32-NEXT:    sw s11, 56(sp)
@@ -918,14 +933,17 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX32-NEXT:    sw s2, 20(sp)
 ; ZHINX32-NEXT:    sw s3, 24(sp)
 ; ZHINX32-NEXT:    sw s4, 28(sp)
-; ZHINX32-NEXT:    sw t6, 0(sp)
-; ZHINX32-NEXT:    sw t4, 4(sp)
-; ZHINX32-NEXT:    sw t5, 8(sp)
+; ZHINX32-NEXT:    sw t0, 0(sp)
+; ZHINX32-NEXT:    sw t1, 4(sp)
+; ZHINX32-NEXT:    sw t2, 8(sp)
 ; ZHINX32-NEXT:    sw s0, 12(sp)
-; ZHINX32-NEXT:    lw t3, 104(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw t4, 100(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw t5, 96(sp) # 4-byte Folded Reload
-; ZHINX32-NEXT:    lw t6, 92(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    mv a1, a5
+; ZHINX32-NEXT:    mv a2, a6
+; ZHINX32-NEXT:    mv a3, a7
+; ZHINX32-NEXT:    lw a4, 92(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw a5, 96(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw a6, 100(sp) # 4-byte Folded Reload
+; ZHINX32-NEXT:    lw a7, 104(sp) # 4-byte Folded Reload
 ; ZHINX32-NEXT:    call callee_float_32
 ; ZHINX32-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
 ; ZHINX32-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
@@ -959,17 +977,20 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX64-NEXT:    sd s9, 120(sp) # 8-byte Folded Spill
 ; ZHINX64-NEXT:    sd s10, 112(sp) # 8-byte Folded Spill
 ; ZHINX64-NEXT:    sd s11, 104(sp) # 8-byte Folded Spill
-; ZHINX64-NEXT:    lw t0, 208(sp)
-; ZHINX64-NEXT:    sw t0, 100(sp) # 4-byte Folded Spill
-; ZHINX64-NEXT:    lw t0, 216(sp)
-; ZHINX64-NEXT:    sw t0, 96(sp) # 4-byte Folded Spill
-; ZHINX64-NEXT:    lw t0, 224(sp)
-; ZHINX64-NEXT:    sw t0, 92(sp) # 4-byte Folded Spill
-; ZHINX64-NEXT:    lw t0, 232(sp)
-; ZHINX64-NEXT:    sw t0, 88(sp) # 4-byte Folded Spill
-; ZHINX64-NEXT:    lw t6, 240(sp)
-; ZHINX64-NEXT:    lw t4, 248(sp)
-; ZHINX64-NEXT:    lw t5, 256(sp)
+; ZHINX64-NEXT:    sw a7, 100(sp) # 4-byte Folded Spill
+; ZHINX64-NEXT:    sw a6, 96(sp) # 4-byte Folded Spill
+; ZHINX64-NEXT:    sw a5, 92(sp) # 4-byte Folded Spill
+; ZHINX64-NEXT:    sw a4, 88(sp) # 4-byte Folded Spill
+; ZHINX64-NEXT:    mv a7, a3
+; ZHINX64-NEXT:    mv a6, a2
+; ZHINX64-NEXT:    mv a5, a1
+; ZHINX64-NEXT:    lw t3, 208(sp)
+; ZHINX64-NEXT:    lw t4, 216(sp)
+; ZHINX64-NEXT:    lw t5, 224(sp)
+; ZHINX64-NEXT:    lw t6, 232(sp)
+; ZHINX64-NEXT:    lw t0, 240(sp)
+; ZHINX64-NEXT:    lw t1, 248(sp)
+; ZHINX64-NEXT:    lw t2, 256(sp)
 ; ZHINX64-NEXT:    lw s0, 264(sp)
 ; ZHINX64-NEXT:    lw s1, 272(sp)
 ; ZHINX64-NEXT:    lw s2, 280(sp)
@@ -983,14 +1004,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX64-NEXT:    lw s10, 344(sp)
 ; ZHINX64-NEXT:    lw s11, 352(sp)
 ; ZHINX64-NEXT:    lw ra, 360(sp)
-; ZHINX64-NEXT:    lw t0, 368(sp)
-; ZHINX64-NEXT:    lw t1, 376(sp)
-; ZHINX64-NEXT:    lw t2, 384(sp)
-; ZHINX64-NEXT:    lw t3, 392(sp)
-; ZHINX64-NEXT:    sw t0, 64(sp)
-; ZHINX64-NEXT:    sw t1, 68(sp)
-; ZHINX64-NEXT:    sw t2, 72(sp)
-; ZHINX64-NEXT:    sw t3, 76(sp)
+; ZHINX64-NEXT:    lw a1, 368(sp)
+; ZHINX64-NEXT:    lw a2, 376(sp)
+; ZHINX64-NEXT:    lw a3, 384(sp)
+; ZHINX64-NEXT:    lw a4, 392(sp)
+; ZHINX64-NEXT:    sw a1, 64(sp)
+; ZHINX64-NEXT:    sw a2, 68(sp)
+; ZHINX64-NEXT:    sw a3, 72(sp)
+; ZHINX64-NEXT:    sw a4, 76(sp)
 ; ZHINX64-NEXT:    sw s9, 48(sp)
 ; ZHINX64-NEXT:    sw s10, 52(sp)
 ; ZHINX64-NEXT:    sw s11, 56(sp)
@@ -1003,14 +1024,17 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZHINX64-NEXT:    sw s2, 20(sp)
 ; ZHINX64-NEXT:    sw s3, 24(sp)
 ; ZHINX64-NEXT:    sw s4, 28(sp)
-; ZHINX64-NEXT:    sw t6, 0(sp)
-; ZHINX64-NEXT:    sw t4, 4(sp)
-; ZHINX64-NEXT:    sw t5, 8(sp)
+; ZHINX64-NEXT:    sw t0, 0(sp)
+; ZHINX64-NEXT:    sw t1, 4(sp)
+; ZHINX64-NEXT:    sw t2, 8(sp)
 ; ZHINX64-NEXT:    sw s0, 12(sp)
-; ZHINX64-NEXT:    lw t3, 100(sp) # 4-byte Folded Reload
-; ZHINX64-NEXT:    lw t4, 96(sp) # 4-byte Folded Reload
-; ZHINX64-NEXT:    lw t5, 92(sp) # 4-byte Folded Reload
-; ZHINX64-NEXT:    lw t6, 88(sp) # 4-byte Folded Reload
+; ZHINX64-NEXT:    mv a1, a5
+; ZHINX64-NEXT:    mv a2, a6
+; ZHINX64-NEXT:    mv a3, a7
+; ZHINX64-NEXT:    lw a4, 88(sp) # 4-byte Folded Reload
+; ZHINX64-NEXT:    lw a5, 92(sp) # 4-byte Folded Reload
+; ZHINX64-NEXT:    lw a6, 96(sp) # 4-byte Folded Reload
+; ZHINX64-NEXT:    lw a7, 100(sp) # 4-byte Folded Reload
 ; ZHINX64-NEXT:    call callee_float_32
 ; ZHINX64-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
 ; ZHINX64-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload
@@ -1044,17 +1068,20 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX32-NEXT:    sw s9, 116(sp) # 4-byte Folded Spill
 ; ZFINX32-NEXT:    sw s10, 112(sp) # 4-byte Folded Spill
 ; ZFINX32-NEXT:    sw s11, 108(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t0, 160(sp)
-; ZFINX32-NEXT:    sw t0, 104(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t0, 164(sp)
-; ZFINX32-NEXT:    sw t0, 100(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t0, 168(sp)
-; ZFINX32-NEXT:    sw t0, 96(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t0, 172(sp)
-; ZFINX32-NEXT:    sw t0, 92(sp) # 4-byte Folded Spill
-; ZFINX32-NEXT:    lw t6, 176(sp)
-; ZFINX32-NEXT:    lw t4, 180(sp)
-; ZFINX32-NEXT:    lw t5, 184(sp)
+; ZFINX32-NEXT:    sw a7, 104(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw a6, 100(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw a5, 96(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    sw a4, 92(sp) # 4-byte Folded Spill
+; ZFINX32-NEXT:    mv a7, a3
+; ZFINX32-NEXT:    mv a6, a2
+; ZFINX32-NEXT:    mv a5, a1
+; ZFINX32-NEXT:    lw t3, 160(sp)
+; ZFINX32-NEXT:    lw t4, 164(sp)
+; ZFINX32-NEXT:    lw t5, 168(sp)
+; ZFINX32-NEXT:    lw t6, 172(sp)
+; ZFINX32-NEXT:    lw t0, 176(sp)
+; ZFINX32-NEXT:    lw t1, 180(sp)
+; ZFINX32-NEXT:    lw t2, 184(sp)
 ; ZFINX32-NEXT:    lw s0, 188(sp)
 ; ZFINX32-NEXT:    lw s1, 192(sp)
 ; ZFINX32-NEXT:    lw s2, 196(sp)
@@ -1068,14 +1095,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX32-NEXT:    lw s10, 228(sp)
 ; ZFINX32-NEXT:    lw s11, 232(sp)
 ; ZFINX32-NEXT:    lw ra, 236(sp)
-; ZFINX32-NEXT:    lw t0, 240(sp)
-; ZFINX32-NEXT:    lw t1, 244(sp)
-; ZFINX32-NEXT:    lw t2, 248(sp)
-; ZFINX32-NEXT:    lw t3, 252(sp)
-; ZFINX32-NEXT:    sw t0, 64(sp)
-; ZFINX32-NEXT:    sw t1, 68(sp)
-; ZFINX32-NEXT:    sw t2, 72(sp)
-; ZFINX32-NEXT:    sw t3, 76(sp)
+; ZFINX32-NEXT:    lw a1, 240(sp)
+; ZFINX32-NEXT:    lw a2, 244(sp)
+; ZFINX32-NEXT:    lw a3, 248(sp)
+; ZFINX32-NEXT:    lw a4, 252(sp)
+; ZFINX32-NEXT:    sw a1, 64(sp)
+; ZFINX32-NEXT:    sw a2, 68(sp)
+; ZFINX32-NEXT:    sw a3, 72(sp)
+; ZFINX32-NEXT:    sw a4, 76(sp)
 ; ZFINX32-NEXT:    sw s9, 48(sp)
 ; ZFINX32-NEXT:    sw s10, 52(sp)
 ; ZFINX32-NEXT:    sw s11, 56(sp)
@@ -1088,14 +1115,17 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX32-NEXT:    sw s2, 20(sp)
 ; ZFINX32-NEXT:    sw s3, 24(sp)
 ; ZFINX32-NEXT:    sw s4, 28(sp)
-; ZFINX32-NEXT:    sw t6, 0(sp)
-; ZFINX32-NEXT:    sw t4, 4(sp)
-; ZFINX32-NEXT:    sw t5, 8(sp)
+; ZFINX32-NEXT:    sw t0, 0(sp)
+; ZFINX32-NEXT:    sw t1, 4(sp)
+; ZFINX32-NEXT:    sw t2, 8(sp)
 ; ZFINX32-NEXT:    sw s0, 12(sp)
-; ZFINX32-NEXT:    lw t3, 104(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw t4, 100(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw t5, 96(sp) # 4-byte Folded Reload
-; ZFINX32-NEXT:    lw t6, 92(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    mv a1, a5
+; ZFINX32-NEXT:    mv a2, a6
+; ZFINX32-NEXT:    mv a3, a7
+; ZFINX32-NEXT:    lw a4, 92(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw a5, 96(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw a6, 100(sp) # 4-byte Folded Reload
+; ZFINX32-NEXT:    lw a7, 104(sp) # 4-byte Folded Reload
 ; ZFINX32-NEXT:    call callee_float_32
 ; ZFINX32-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
 ; ZFINX32-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
@@ -1129,17 +1159,20 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX64-NEXT:    sd s9, 120(sp) # 8-byte Folded Spill
 ; ZFINX64-NEXT:    sd s10, 112(sp) # 8-byte Folded Spill
 ; ZFINX64-NEXT:    sd s11, 104(sp) # 8-byte Folded Spill
-; ZFINX64-NEXT:    lw t0, 208(sp)
-; ZFINX64-NEXT:    sw t0, 100(sp) # 4-byte Folded Spill
-; ZFINX64-NEXT:    lw t0, 216(sp)
-; ZFINX64-NEXT:    sw t0, 96(sp) # 4-byte Folded Spill
-; ZFINX64-NEXT:    lw t0, 224(sp)
-; ZFINX64-NEXT:    sw t0, 92(sp) # 4-byte Folded Spill
-; ZFINX64-NEXT:    lw t0, 232(sp)
-; ZFINX64-NEXT:    sw t0, 88(sp) # 4-byte Folded Spill
-; ZFINX64-NEXT:    lw t6, 240(sp)
-; ZFINX64-NEXT:    lw t4, 248(sp)
-; ZFINX64-NEXT:    lw t5, 256(sp)
+; ZFINX64-NEXT:    sw a7, 100(sp) # 4-byte Folded Spill
+; ZFINX64-NEXT:    sw a6, 96(sp) # 4-byte Folded Spill
+; ZFINX64-NEXT:    sw a5, 92(sp) # 4-byte Folded Spill
+; ZFINX64-NEXT:    sw a4, 88(sp) # 4-byte Folded Spill
+; ZFINX64-NEXT:    mv a7, a3
+; ZFINX64-NEXT:    mv a6, a2
+; ZFINX64-NEXT:    mv a5, a1
+; ZFINX64-NEXT:    lw t3, 208(sp)
+; ZFINX64-NEXT:    lw t4, 216(sp)
+; ZFINX64-NEXT:    lw t5, 224(sp)
+; ZFINX64-NEXT:    lw t6, 232(sp)
+; ZFINX64-NEXT:    lw t0, 240(sp)
+; ZFINX64-NEXT:    lw t1, 248(sp)
+; ZFINX64-NEXT:    lw t2, 256(sp)
 ; ZFINX64-NEXT:    lw s0, 264(sp)
 ; ZFINX64-NEXT:    lw s1, 272(sp)
 ; ZFINX64-NEXT:    lw s2, 280(sp)
@@ -1153,14 +1186,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX64-NEXT:    lw s10, 344(sp)
 ; ZFINX64-NEXT:    lw s11, 352(sp)
 ; ZFINX64-NEXT:    lw ra, 360(sp)
-; ZFINX64-NEXT:    lw t0, 368(sp)
-; ZFINX64-NEXT:    lw t1, 376(sp)
-; ZFINX64-NEXT:    lw t2, 384(sp)
-; ZFINX64-NEXT:    lw t3, 392(sp)
-; ZFINX64-NEXT:    sw t0, 64(sp)
-; ZFINX64-NEXT:    sw t1, 68(sp)
-; ZFINX64-NEXT:    sw t2, 72(sp)
-; ZFINX64-NEXT:    sw t3, 76(sp)
+; ZFINX64-NEXT:    lw a1, 368(sp)
+; ZFINX64-NEXT:    lw a2, 376(sp)
+; ZFINX64-NEXT:    lw a3, 384(sp)
+; ZFINX64-NEXT:    lw a4, 392(sp)
+; ZFINX64-NEXT:    sw a1, 64(sp)
+; ZFINX64-NEXT:    sw a2, 68(sp)
+; ZFINX64-NEXT:    sw a3, 72(sp)
+; ZFINX64-NEXT:    sw a4, 76(sp)
 ; ZFINX64-NEXT:    sw s9, 48(sp)
 ; ZFINX64-NEXT:    sw s10, 52(sp)
 ; ZFINX64-NEXT:    sw s11, 56(sp)
@@ -1173,14 +1206,17 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZFINX64-NEXT:    sw s2, 20(sp)
 ; ZFINX64-NEXT:    sw s3, 24(sp)
 ; ZFINX64-NEXT:    sw s4, 28(sp)
-; ZFINX64-NEXT:    sw t6, 0(sp)
-; ZFINX64-NEXT:    sw t4, 4(sp)
-; ZFINX64-NEXT:    sw t5, 8(sp)
+; ZFINX64-NEXT:    sw t0, 0(sp)
+; ZFINX64-NEXT:    sw t1, 4(sp)
+; ZFINX64-NEXT:    sw t2, 8(sp)
 ; ZFINX64-NEXT:    sw s0, 12(sp)
-; ZFINX64-NEXT:    lw t3, 100(sp) # 4-byte Folded Reload
-; ZFINX64-NEXT:    lw t4, 96(sp) # 4-byte Folded Reload
-; ZFINX64-NEXT:    lw t5, 92(sp) # 4-byte Folded Reload
-; ZFINX64-NEXT:    lw t6, 88(sp) # 4-byte Folded Reload
+; ZFINX64-NEXT:    mv a1, a5
+; ZFINX64-NEXT:    mv a2, a6
+; ZFINX64-NEXT:    mv a3, a7
+; ZFINX64-NEXT:    lw a4, 88(sp) # 4-byte Folded Reload
+; ZFINX64-NEXT:    lw a5, 92(sp) # 4-byte Folded Reload
+; ZFINX64-NEXT:    lw a6, 96(sp) # 4-byte Folded Reload
+; ZFINX64-NEXT:    lw a7, 100(sp) # 4-byte Folded Reload
 ; ZFINX64-NEXT:    call callee_float_32
 ; ZFINX64-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
 ; ZFINX64-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload
@@ -1214,17 +1250,20 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX32-NEXT:    sw s9, 116(sp) # 4-byte Folded Spill
 ; ZDINX32-NEXT:    sw s10, 112(sp) # 4-byte Folded Spill
 ; ZDINX32-NEXT:    sw s11, 108(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t0, 160(sp)
-; ZDINX32-NEXT:    sw t0, 104(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t0, 164(sp)
-; ZDINX32-NEXT:    sw t0, 100(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t0, 168(sp)
-; ZDINX32-NEXT:    sw t0, 96(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t0, 172(sp)
-; ZDINX32-NEXT:    sw t0, 92(sp) # 4-byte Folded Spill
-; ZDINX32-NEXT:    lw t6, 176(sp)
-; ZDINX32-NEXT:    lw t4, 180(sp)
-; ZDINX32-NEXT:    lw t5, 184(sp)
+; ZDINX32-NEXT:    sw a7, 104(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw a6, 100(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw a5, 96(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    sw a4, 92(sp) # 4-byte Folded Spill
+; ZDINX32-NEXT:    mv a7, a3
+; ZDINX32-NEXT:    mv a6, a2
+; ZDINX32-NEXT:    mv a5, a1
+; ZDINX32-NEXT:    lw t3, 160(sp)
+; ZDINX32-NEXT:    lw t4, 164(sp)
+; ZDINX32-NEXT:    lw t5, 168(sp)
+; ZDINX32-NEXT:    lw t6, 172(sp)
+; ZDINX32-NEXT:    lw t0, 176(sp)
+; ZDINX32-NEXT:    lw t1, 180(sp)
+; ZDINX32-NEXT:    lw t2, 184(sp)
 ; ZDINX32-NEXT:    lw s0, 188(sp)
 ; ZDINX32-NEXT:    lw s1, 192(sp)
 ; ZDINX32-NEXT:    lw s2, 196(sp)
@@ -1238,14 +1277,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX32-NEXT:    lw s10, 228(sp)
 ; ZDINX32-NEXT:    lw s11, 232(sp)
 ; ZDINX32-NEXT:    lw ra, 236(sp)
-; ZDINX32-NEXT:    lw t0, 240(sp)
-; ZDINX32-NEXT:    lw t1, 244(sp)
-; ZDINX32-NEXT:    lw t2, 248(sp)
-; ZDINX32-NEXT:    lw t3, 252(sp)
-; ZDINX32-NEXT:    sw t0, 64(sp)
-; ZDINX32-NEXT:    sw t1, 68(sp)
-; ZDINX32-NEXT:    sw t2, 72(sp)
-; ZDINX32-NEXT:    sw t3, 76(sp)
+; ZDINX32-NEXT:    lw a1, 240(sp)
+; ZDINX32-NEXT:    lw a2, 244(sp)
+; ZDINX32-NEXT:    lw a3, 248(sp)
+; ZDINX32-NEXT:    lw a4, 252(sp)
+; ZDINX32-NEXT:    sw a1, 64(sp)
+; ZDINX32-NEXT:    sw a2, 68(sp)
+; ZDINX32-NEXT:    sw a3, 72(sp)
+; ZDINX32-NEXT:    sw a4, 76(sp)
 ; ZDINX32-NEXT:    sw s9, 48(sp)
 ; ZDINX32-NEXT:    sw s10, 52(sp)
 ; ZDINX32-NEXT:    sw s11, 56(sp)
@@ -1258,14 +1297,17 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX32-NEXT:    sw s2, 20(sp)
 ; ZDINX32-NEXT:    sw s3, 24(sp)
 ; ZDINX32-NEXT:    sw s4, 28(sp)
-; ZDINX32-NEXT:    sw t6, 0(sp)
-; ZDINX32-NEXT:    sw t4, 4(sp)
-; ZDINX32-NEXT:    sw t5, 8(sp)
+; ZDINX32-NEXT:    sw t0, 0(sp)
+; ZDINX32-NEXT:    sw t1, 4(sp)
+; ZDINX32-NEXT:    sw t2, 8(sp)
 ; ZDINX32-NEXT:    sw s0, 12(sp)
-; ZDINX32-NEXT:    lw t3, 104(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw t4, 100(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw t5, 96(sp) # 4-byte Folded Reload
-; ZDINX32-NEXT:    lw t6, 92(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    mv a1, a5
+; ZDINX32-NEXT:    mv a2, a6
+; ZDINX32-NEXT:    mv a3, a7
+; ZDINX32-NEXT:    lw a4, 92(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw a5, 96(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw a6, 100(sp) # 4-byte Folded Reload
+; ZDINX32-NEXT:    lw a7, 104(sp) # 4-byte Folded Reload
 ; ZDINX32-NEXT:    call callee_float_32
 ; ZDINX32-NEXT:    lw ra, 156(sp) # 4-byte Folded Reload
 ; ZDINX32-NEXT:    lw s0, 152(sp) # 4-byte Folded Reload
@@ -1299,17 +1341,20 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX64-NEXT:    sd s9, 120(sp) # 8-byte Folded Spill
 ; ZDINX64-NEXT:    sd s10, 112(sp) # 8-byte Folded Spill
 ; ZDINX64-NEXT:    sd s11, 104(sp) # 8-byte Folded Spill
-; ZDINX64-NEXT:    lw t0, 208(sp)
-; ZDINX64-NEXT:    sw t0, 100(sp) # 4-byte Folded Spill
-; ZDINX64-NEXT:    lw t0, 216(sp)
-; ZDINX64-NEXT:    sw t0, 96(sp) # 4-byte Folded Spill
-; ZDINX64-NEXT:    lw t0, 224(sp)
-; ZDINX64-NEXT:    sw t0, 92(sp) # 4-byte Folded Spill
-; ZDINX64-NEXT:    lw t0, 232(sp)
-; ZDINX64-NEXT:    sw t0, 88(sp) # 4-byte Folded Spill
-; ZDINX64-NEXT:    lw t6, 240(sp)
-; ZDINX64-NEXT:    lw t4, 248(sp)
-; ZDINX64-NEXT:    lw t5, 256(sp)
+; ZDINX64-NEXT:    sw a7, 100(sp) # 4-byte Folded Spill
+; ZDINX64-NEXT:    sw a6, 96(sp) # 4-byte Folded Spill
+; ZDINX64-NEXT:    sw a5, 92(sp) # 4-byte Folded Spill
+; ZDINX64-NEXT:    sw a4, 88(sp) # 4-byte Folded Spill
+; ZDINX64-NEXT:    mv a7, a3
+; ZDINX64-NEXT:    mv a6, a2
+; ZDINX64-NEXT:    mv a5, a1
+; ZDINX64-NEXT:    lw t3, 208(sp)
+; ZDINX64-NEXT:    lw t4, 216(sp)
+; ZDINX64-NEXT:    lw t5, 224(sp)
+; ZDINX64-NEXT:    lw t6, 232(sp)
+; ZDINX64-NEXT:    lw t0, 240(sp)
+; ZDINX64-NEXT:    lw t1, 248(sp)
+; ZDINX64-NEXT:    lw t2, 256(sp)
 ; ZDINX64-NEXT:    lw s0, 264(sp)
 ; ZDINX64-NEXT:    lw s1, 272(sp)
 ; ZDINX64-NEXT:    lw s2, 280(sp)
@@ -1323,14 +1368,14 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX64-NEXT:    lw s10, 344(sp)
 ; ZDINX64-NEXT:    lw s11, 352(sp)
 ; ZDINX64-NEXT:    lw ra, 360(sp)
-; ZDINX64-NEXT:    lw t0, 368(sp)
-; ZDINX64-NEXT:    lw t1, 376(sp)
-; ZDINX64-NEXT:    lw t2, 384(sp)
-; ZDINX64-NEXT:    lw t3, 392(sp)
-; ZDINX64-NEXT:    sw t0, 64(sp)
-; ZDINX64-NEXT:    sw t1, 68(sp)
-; ZDINX64-NEXT:    sw t2, 72(sp)
-; ZDINX64-NEXT:    sw t3, 76(sp)
+; ZDINX64-NEXT:    lw a1, 368(sp)
+; ZDINX64-NEXT:    lw a2, 376(sp)
+; ZDINX64-NEXT:    lw a3, 384(sp)
+; ZDINX64-NEXT:    lw a4, 392(sp)
+; ZDINX64-NEXT:    sw a1, 64(sp)
+; ZDINX64-NEXT:    sw a2, 68(sp)
+; ZDINX64-NEXT:    sw a3, 72(sp)
+; ZDINX64-NEXT:    sw a4, 76(sp)
 ; ZDINX64-NEXT:    sw s9, 48(sp)
 ; ZDINX64-NEXT:    sw s10, 52(sp)
 ; ZDINX64-NEXT:    sw s11, 56(sp)
@@ -1343,14 +1388,17 @@ define float @caller_float_32(<32 x float> %A) nounwind {
 ; ZDINX64-NEXT:    sw s2, 20(sp)
 ; ZDINX64-NEXT:    sw s3, 24(sp)
 ; ZDINX64-NEXT:    sw s4, 28(sp)
-; ZDINX64-NEXT:    sw t6, 0(sp)
-; ZDINX64-NEXT:    sw t4, 4(sp)
-; ZDINX64-NEXT:    sw t5, 8(sp)
+; ZDINX64-NEXT:    sw t0, 0(sp)
+; ZDINX64-NEXT:    sw t1, 4(sp)
+; ZDINX64-NEXT:    sw t2, 8(sp)
 ; ZDINX64-NEXT:    sw s0, 12(sp)
-; ZDINX64-NEXT:    lw t3, 100(sp) # 4-byte Folded Reload
-; ZDINX64-NEXT:    lw t4, 96(sp) # 4-byte Folded Reload
-; ZDINX64-NEXT:    lw t5, 92(sp) # 4-byte Folded Reload
-; ZDINX64-NEXT:    lw t6, 88(sp) # 4-byte Folded Reload
+; ZDINX64-NEXT:    mv a1, a5
+; ZDINX64-NEXT:    mv a2, a6
+; ZDINX64-NEXT:    mv a3, a7
+; ZDINX64-NEXT:    lw a4, 88(sp) # 4-byte Folded Reload
+; ZDINX64-NEXT:    lw a5, 92(sp) # 4-byte Folded Reload
+; ZDINX64-NEXT:    lw a6, 96(sp) # 4-byte Folded Reload
+; ZDINX64-NEXT:    lw a7, 100(sp) # 4-byte Folded Reload
 ; ZDINX64-NEXT:    call callee_float_32
 ; ZDINX64-NEXT:    ld ra, 200(sp) # 8-byte Folded Reload
 ; ZDINX64-NEXT:    ld s0, 192(sp) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/float-arith.ll b/llvm/test/CodeGen/RISCV/float-arith.ll
index bf500d1a2adb39..57b3423da69a6b 100644
--- a/llvm/test/CodeGen/RISCV/float-arith.ll
+++ b/llvm/test/CodeGen/RISCV/float-arith.ll
@@ -195,8 +195,8 @@ define float @fsgnj_s(float %a, float %b) nounwind {
 ; RV32I-LABEL: fsgnj_s:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -204,8 +204,8 @@ define float @fsgnj_s(float %a, float %b) nounwind {
 ; RV64I-LABEL: fsgnj_s:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 33
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -284,8 +284,8 @@ define float @fsgnjn_s(float %a, float %b) nounwind {
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    not a0, a0
 ; RV32I-NEXT:    lui a1, 524288
-; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli s0, s0, 1
+; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    srli s0, s0, 1
 ; RV32I-NEXT:    or a0, s0, a0
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -302,8 +302,8 @@ define float @fsgnjn_s(float %a, float %b) nounwind {
 ; RV64I-NEXT:    call __addsf3
 ; RV64I-NEXT:    not a0, a0
 ; RV64I-NEXT:    lui a1, 524288
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli s0, s0, 33
+; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    srli s0, s0, 33
 ; RV64I-NEXT:    or a0, s0, a0
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
index 86f6f079243c26..aaeb1b7c0b1fb1 100644
--- a/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
+++ b/llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll
@@ -107,8 +107,8 @@ define float @fcopysign_fneg(float %a, float %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a1, a1
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -131,8 +131,8 @@ define float @fcopysign_fneg(float %a, float %b) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    not a1, a1
 ; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 33
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll
index 031976b4fa2b21..fc866d71a3a709 100644
--- a/llvm/test/CodeGen/RISCV/float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/float-convert.ll
@@ -629,7 +629,7 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32IF-NEXT:    fmv.w.x fa5, a0
 ; RV32IF-NEXT:    fle.s s0, fa5, fa0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB12_2
 ; RV32IF-NEXT:  # %bb.1: # %start
@@ -637,19 +637,19 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32IF-NEXT:  .LBB12_2: # %start
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI12_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI12_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB12_4
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB12_4
 ; RV32IF-NEXT:  # %bb.3:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB12_4: # %start
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -676,7 +676,7 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB12_2
 ; RV32IZFINX-NEXT:  # %bb.1: # %start
@@ -684,19 +684,19 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32IZFINX-NEXT:  .LBB12_2: # %start
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB12_4
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB12_4
 ; RV32IZFINX-NEXT:  # %bb.3:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB12_4: # %start
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -722,40 +722,40 @@ define i64 @fcvt_l_s_sat(float %a) nounwind {
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    lui a1, 913408
 ; RV32I-NEXT:    call __gesf2
-; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    call __fixsfdi
 ; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    call __fixsfdi
+; RV32I-NEXT:    mv s0, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui s5, 524288
-; RV32I-NEXT:    bgez s1, .LBB12_2
+; RV32I-NEXT:    bgez s2, .LBB12_2
 ; RV32I-NEXT:  # %bb.1: # %start
 ; RV32I-NEXT:    lui s3, 524288
 ; RV32I-NEXT:  .LBB12_2: # %start
 ; RV32I-NEXT:    lui a1, 389120
 ; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a0, s1
 ; RV32I-NEXT:    call __gtsf2
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    blez a0, .LBB12_4
 ; RV32I-NEXT:  # %bb.3: # %start
 ; RV32I-NEXT:    addi s3, s5, -1
 ; RV32I-NEXT:  .LBB12_4: # %start
-; RV32I-NEXT:    mv a0, s0
-; RV32I-NEXT:    mv a1, s0
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s1
 ; RV32I-NEXT:    call __unordsf2
 ; RV32I-NEXT:    snez a0, a0
+; RV32I-NEXT:    slti a1, s2, 0
+; RV32I-NEXT:    sgtz a2, s4
 ; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    addi a3, a1, -1
 ; RV32I-NEXT:    and a1, a0, s3
-; RV32I-NEXT:    slti a2, s1, 0
-; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a2, a2, s2
-; RV32I-NEXT:    sgtz a3, s4
-; RV32I-NEXT:    neg a3, a3
-; RV32I-NEXT:    or a2, a3, a2
+; RV32I-NEXT:    and a3, a3, s0
+; RV32I-NEXT:    neg a2, a2
+; RV32I-NEXT:    or a2, a2, a3
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -879,10 +879,10 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI14_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI14_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -912,11 +912,11 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -955,10 +955,10 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind {
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __gtsf2
 ; RV32I-NEXT:    sgtz a0, a0
-; RV32I-NEXT:    neg a1, a0
-; RV32I-NEXT:    or a0, a1, s3
-; RV32I-NEXT:    and a2, s2, s1
-; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    and a1, s2, s1
+; RV32I-NEXT:    neg a2, a0
+; RV32I-NEXT:    or a0, a2, s3
+; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1417,12 +1417,12 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind {
 ; RV32IF-LABEL: fcvt_w_s_sat_i16:
 ; RV32IF:       # %bb.0: # %start
 ; RV32IF-NEXT:    feq.s a0, fa0, fa0
-; RV32IF-NEXT:    neg a0, a0
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI24_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI24_0)(a1)
 ; RV32IF-NEXT:    lui a1, 815104
 ; RV32IF-NEXT:    fmv.w.x fa4, a1
 ; RV32IF-NEXT:    fmax.s fa4, fa0, fa4
+; RV32IF-NEXT:    neg a0, a0
 ; RV32IF-NEXT:    fmin.s fa5, fa4, fa5
 ; RV32IF-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IF-NEXT:    and a0, a0, a1
@@ -1431,12 +1431,12 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind {
 ; RV64IF-LABEL: fcvt_w_s_sat_i16:
 ; RV64IF:       # %bb.0: # %start
 ; RV64IF-NEXT:    feq.s a0, fa0, fa0
-; RV64IF-NEXT:    neg a0, a0
 ; RV64IF-NEXT:    lui a1, %hi(.LCPI24_0)
 ; RV64IF-NEXT:    flw fa5, %lo(.LCPI24_0)(a1)
 ; RV64IF-NEXT:    lui a1, 815104
 ; RV64IF-NEXT:    fmv.w.x fa4, a1
 ; RV64IF-NEXT:    fmax.s fa4, fa0, fa4
+; RV64IF-NEXT:    neg a0, a0
 ; RV64IF-NEXT:    fmin.s fa5, fa4, fa5
 ; RV64IF-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IF-NEXT:    and a0, a0, a1
@@ -1445,10 +1445,10 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind {
 ; RV32IZFINX-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZFINX:       # %bb.0: # %start
 ; RV32IZFINX-NEXT:    feq.s a1, a0, a0
-; RV32IZFINX-NEXT:    neg a1, a1
 ; RV32IZFINX-NEXT:    lui a2, 815104
 ; RV32IZFINX-NEXT:    fmax.s a0, a0, a2
 ; RV32IZFINX-NEXT:    lui a2, 290816
+; RV32IZFINX-NEXT:    neg a1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -512
 ; RV32IZFINX-NEXT:    fmin.s a0, a0, a2
 ; RV32IZFINX-NEXT:    fcvt.w.s a0, a0, rtz
@@ -1458,10 +1458,10 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind {
 ; RV64IZFINX-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZFINX:       # %bb.0: # %start
 ; RV64IZFINX-NEXT:    feq.s a1, a0, a0
-; RV64IZFINX-NEXT:    neg a1, a1
 ; RV64IZFINX-NEXT:    lui a2, 815104
 ; RV64IZFINX-NEXT:    fmax.s a0, a0, a2
 ; RV64IZFINX-NEXT:    lui a2, 290816
+; RV64IZFINX-NEXT:    neg a1, a1
 ; RV64IZFINX-NEXT:    addiw a2, a2, -512
 ; RV64IZFINX-NEXT:    fmin.s a0, a0, a2
 ; RV64IZFINX-NEXT:    fcvt.l.s a0, a0, rtz
@@ -1763,11 +1763,11 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind {
 ; RV32IF-LABEL: fcvt_w_s_sat_i8:
 ; RV32IF:       # %bb.0: # %start
 ; RV32IF-NEXT:    feq.s a0, fa0, fa0
-; RV32IF-NEXT:    neg a0, a0
 ; RV32IF-NEXT:    lui a1, 798720
 ; RV32IF-NEXT:    fmv.w.x fa5, a1
-; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32IF-NEXT:    lui a1, 274400
+; RV32IF-NEXT:    neg a0, a0
+; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32IF-NEXT:    fmv.w.x fa4, a1
 ; RV32IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IF-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -1777,11 +1777,11 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind {
 ; RV64IF-LABEL: fcvt_w_s_sat_i8:
 ; RV64IF:       # %bb.0: # %start
 ; RV64IF-NEXT:    feq.s a0, fa0, fa0
-; RV64IF-NEXT:    neg a0, a0
 ; RV64IF-NEXT:    lui a1, 798720
 ; RV64IF-NEXT:    fmv.w.x fa5, a1
-; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64IF-NEXT:    lui a1, 274400
+; RV64IF-NEXT:    neg a0, a0
+; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64IF-NEXT:    fmv.w.x fa4, a1
 ; RV64IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IF-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -1791,8 +1791,8 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind {
 ; RV32IZFINX-LABEL: fcvt_w_s_sat_i8:
 ; RV32IZFINX:       # %bb.0: # %start
 ; RV32IZFINX-NEXT:    feq.s a1, a0, a0
-; RV32IZFINX-NEXT:    neg a1, a1
 ; RV32IZFINX-NEXT:    lui a2, 798720
+; RV32IZFINX-NEXT:    neg a1, a1
 ; RV32IZFINX-NEXT:    fmax.s a0, a0, a2
 ; RV32IZFINX-NEXT:    lui a2, 274400
 ; RV32IZFINX-NEXT:    fmin.s a0, a0, a2
@@ -1803,8 +1803,8 @@ define signext i8 @fcvt_w_s_sat_i8(float %a) nounwind {
 ; RV64IZFINX-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZFINX:       # %bb.0: # %start
 ; RV64IZFINX-NEXT:    feq.s a1, a0, a0
-; RV64IZFINX-NEXT:    neg a1, a1
 ; RV64IZFINX-NEXT:    lui a2, 798720
+; RV64IZFINX-NEXT:    neg a1, a1
 ; RV64IZFINX-NEXT:    fmax.s a0, a0, a2
 ; RV64IZFINX-NEXT:    lui a2, 274400
 ; RV64IZFINX-NEXT:    fmin.s a0, a0, a2
@@ -1943,8 +1943,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(float %a) nounwind {
 ; RV32IF-LABEL: fcvt_wu_s_sat_i8:
 ; RV32IF:       # %bb.0: # %start
 ; RV32IF-NEXT:    fmv.w.x fa5, zero
-; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32IF-NEXT:    lui a0, 276464
+; RV32IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32IF-NEXT:    fmv.w.x fa4, a0
 ; RV32IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IF-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -1953,8 +1953,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(float %a) nounwind {
 ; RV64IF-LABEL: fcvt_wu_s_sat_i8:
 ; RV64IF:       # %bb.0: # %start
 ; RV64IF-NEXT:    fmv.w.x fa5, zero
-; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64IF-NEXT:    lui a0, 276464
+; RV64IF-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64IF-NEXT:    fmv.w.x fa4, a0
 ; RV64IF-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IF-NEXT:    fcvt.lu.s a0, fa5, rtz

diff  --git a/llvm/test/CodeGen/RISCV/float-intrinsics.ll b/llvm/test/CodeGen/RISCV/float-intrinsics.ll
index e154f3361a1216..37381aeeb2a0f3 100644
--- a/llvm/test/CodeGen/RISCV/float-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/float-intrinsics.ll
@@ -807,8 +807,8 @@ define float @copysign_f32(float %a, float %b) nounwind {
 ; RV32I-LABEL: copysign_f32:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 524288
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -816,8 +816,8 @@ define float @copysign_f32(float %a, float %b) nounwind {
 ; RV64I-LABEL: copysign_f32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 524288
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 33
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -1603,54 +1603,54 @@ define i1 @fpclass(float %x) {
 ; RV32I-LABEL: fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 1
-; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    addi a2, a1, -1
-; RV32I-NEXT:    lui a3, 2048
-; RV32I-NEXT:    addi a3, a3, -1
-; RV32I-NEXT:    sltu a2, a2, a3
+; RV32I-NEXT:    lui a2, 2048
 ; RV32I-NEXT:    slti a0, a0, 0
-; RV32I-NEXT:    and a2, a2, a0
-; RV32I-NEXT:    seqz a3, a1
-; RV32I-NEXT:    lui a4, 522240
-; RV32I-NEXT:    xor a5, a1, a4
+; RV32I-NEXT:    lui a3, 522240
+; RV32I-NEXT:    lui a4, 1046528
+; RV32I-NEXT:    srli a1, a1, 1
+; RV32I-NEXT:    addi a2, a2, -1
+; RV32I-NEXT:    addi a5, a1, -1
+; RV32I-NEXT:    sltu a2, a5, a2
+; RV32I-NEXT:    xor a5, a1, a3
+; RV32I-NEXT:    slt a3, a3, a1
+; RV32I-NEXT:    add a4, a1, a4
+; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    seqz a5, a5
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    or a2, a3, a2
-; RV32I-NEXT:    slt a3, a4, a1
-; RV32I-NEXT:    or a2, a2, a3
-; RV32I-NEXT:    lui a3, 1046528
-; RV32I-NEXT:    add a1, a1, a3
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    sltiu a1, a1, 127
-; RV32I-NEXT:    and a0, a1, a0
-; RV32I-NEXT:    or a0, a2, a0
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    and a2, a2, a0
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    sltiu a4, a4, 127
+; RV32I-NEXT:    or a1, a1, a2
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    and a0, a4, a0
+; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sext.w a1, a0
 ; RV64I-NEXT:    slli a0, a0, 33
+; RV64I-NEXT:    lui a2, 2048
+; RV64I-NEXT:    lui a3, 522240
+; RV64I-NEXT:    lui a4, 1046528
 ; RV64I-NEXT:    srli a0, a0, 33
-; RV64I-NEXT:    addi a2, a0, -1
-; RV64I-NEXT:    lui a3, 2048
-; RV64I-NEXT:    addiw a3, a3, -1
-; RV64I-NEXT:    sltu a2, a2, a3
+; RV64I-NEXT:    addiw a2, a2, -1
 ; RV64I-NEXT:    slti a1, a1, 0
-; RV64I-NEXT:    and a2, a2, a1
-; RV64I-NEXT:    seqz a3, a0
-; RV64I-NEXT:    lui a4, 522240
-; RV64I-NEXT:    xor a5, a0, a4
+; RV64I-NEXT:    addi a5, a0, -1
+; RV64I-NEXT:    sltu a2, a5, a2
+; RV64I-NEXT:    xor a5, a0, a3
+; RV64I-NEXT:    slt a3, a3, a0
+; RV64I-NEXT:    add a4, a0, a4
+; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    seqz a5, a5
-; RV64I-NEXT:    or a3, a3, a5
-; RV64I-NEXT:    or a2, a3, a2
-; RV64I-NEXT:    slt a3, a4, a0
-; RV64I-NEXT:    or a2, a2, a3
-; RV64I-NEXT:    lui a3, 1046528
-; RV64I-NEXT:    add a0, a0, a3
-; RV64I-NEXT:    srliw a0, a0, 24
-; RV64I-NEXT:    sltiu a0, a0, 127
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    srliw a4, a4, 24
+; RV64I-NEXT:    and a2, a2, a1
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    sltiu a4, a4, 127
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
   %cmp = call i1 @llvm.is.fpclass.f32(float %x, i32 639)
   ret i1 %cmp
@@ -1732,8 +1732,8 @@ define i1 @isqnan_fpclass(float %x) {
 ; RV32I-LABEL: isqnan_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    lui a1, 523264
+; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    slt a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -1741,8 +1741,8 @@ define i1 @isqnan_fpclass(float %x) {
 ; RV64I-LABEL: isqnan_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    lui a1, 523264
+; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    addiw a1, a1, -1
 ; RV64I-NEXT:    slt a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -1782,10 +1782,10 @@ define i1 @issnan_fpclass(float %x) {
 ; RV32I-LABEL: issnan_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    lui a1, 523264
-; RV32I-NEXT:    slt a1, a0, a1
 ; RV32I-NEXT:    lui a2, 522240
+; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    slt a1, a0, a1
 ; RV32I-NEXT:    slt a0, a2, a0
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -1793,10 +1793,10 @@ define i1 @issnan_fpclass(float %x) {
 ; RV64I-LABEL: issnan_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    lui a1, 523264
-; RV64I-NEXT:    slt a1, a0, a1
 ; RV64I-NEXT:    lui a2, 522240
+; RV64I-NEXT:    srli a0, a0, 33
+; RV64I-NEXT:    slt a1, a0, a1
 ; RV64I-NEXT:    slt a0, a2, a0
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -2068,8 +2068,8 @@ define i1 @isnegfinite_fpclass(float %x) {
 ; RV32I-LABEL: isnegfinite_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 1
-; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    lui a2, 522240
+; RV32I-NEXT:    srli a1, a1, 1
 ; RV32I-NEXT:    slt a1, a1, a2
 ; RV32I-NEXT:    slti a0, a0, 0
 ; RV32I-NEXT:    and a0, a1, a0
@@ -2079,8 +2079,8 @@ define i1 @isnegfinite_fpclass(float %x) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sext.w a1, a0
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    lui a2, 522240
+; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    slt a0, a0, a2
 ; RV64I-NEXT:    slti a1, a1, 0
 ; RV64I-NEXT:    and a0, a0, a1
@@ -2121,8 +2121,8 @@ define i1 @isnotfinite_fpclass(float %x) {
 ; RV32I-LABEL: isnotfinite_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 1
-; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    lui a1, 522240
+; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    slt a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -2130,8 +2130,8 @@ define i1 @isnotfinite_fpclass(float %x) {
 ; RV64I-LABEL: isnotfinite_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 33
-; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    lui a1, 522240
+; RV64I-NEXT:    srli a0, a0, 33
 ; RV64I-NEXT:    addiw a1, a1, -1
 ; RV64I-NEXT:    slt a0, a1, a0
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
index 198b18c75272a9..809cc31abe612f 100644
--- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll
@@ -54,7 +54,7 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB1_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -62,19 +62,19 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB1_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB1_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB1_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB1_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -110,7 +110,7 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB1_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -118,19 +118,19 @@ define i64 @test_floor_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB1_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB1_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB1_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB1_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -199,10 +199,10 @@ define i64 @test_floor_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI3_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI3_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -241,11 +241,11 @@ define i64 @test_floor_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -312,7 +312,7 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB5_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -320,19 +320,19 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB5_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI5_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI5_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB5_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB5_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB5_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -368,7 +368,7 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB5_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -376,19 +376,19 @@ define i64 @test_ceil_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB5_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB5_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB5_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB5_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -457,10 +457,10 @@ define i64 @test_ceil_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI7_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI7_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -499,11 +499,11 @@ define i64 @test_ceil_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -570,7 +570,7 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB9_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -578,19 +578,19 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB9_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI9_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI9_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB9_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB9_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB9_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -626,7 +626,7 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB9_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -634,19 +634,19 @@ define i64 @test_trunc_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB9_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB9_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB9_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB9_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -715,10 +715,10 @@ define i64 @test_trunc_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI11_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI11_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -757,11 +757,11 @@ define i64 @test_trunc_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -828,7 +828,7 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB13_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -836,19 +836,19 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB13_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI13_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI13_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB13_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB13_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB13_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -884,7 +884,7 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB13_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -892,19 +892,19 @@ define i64 @test_round_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB13_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB13_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB13_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB13_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -973,10 +973,10 @@ define i64 @test_round_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI15_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI15_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1015,11 +1015,11 @@ define i64 @test_round_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1086,7 +1086,7 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB17_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -1094,19 +1094,19 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB17_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI17_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI17_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB17_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB17_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB17_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1142,7 +1142,7 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB17_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -1150,19 +1150,19 @@ define i64 @test_roundeven_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB17_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB17_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB17_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB17_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1231,10 +1231,10 @@ define i64 @test_roundeven_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI19_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI19_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1273,11 +1273,11 @@ define i64 @test_roundeven_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1344,7 +1344,7 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IF-NEXT:    fle.s s0, fa5, fs0
 ; RV32IF-NEXT:    fmv.s fa0, fs0
 ; RV32IF-NEXT:    call __fixsfdi
-; RV32IF-NEXT:    lui a4, 524288
+; RV32IF-NEXT:    lui a3, 524288
 ; RV32IF-NEXT:    lui a2, 524288
 ; RV32IF-NEXT:    beqz s0, .LBB21_4
 ; RV32IF-NEXT:  # %bb.3:
@@ -1352,19 +1352,19 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IF-NEXT:  .LBB21_4:
 ; RV32IF-NEXT:    lui a1, %hi(.LCPI21_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI21_0)(a1)
-; RV32IF-NEXT:    flt.s a3, fa5, fs0
-; RV32IF-NEXT:    beqz a3, .LBB21_6
+; RV32IF-NEXT:    flt.s a1, fa5, fs0
+; RV32IF-NEXT:    beqz a1, .LBB21_6
 ; RV32IF-NEXT:  # %bb.5:
-; RV32IF-NEXT:    addi a2, a4, -1
+; RV32IF-NEXT:    addi a2, a3, -1
 ; RV32IF-NEXT:  .LBB21_6:
-; RV32IF-NEXT:    feq.s a1, fs0, fs0
-; RV32IF-NEXT:    neg a4, a1
-; RV32IF-NEXT:    and a1, a4, a2
-; RV32IF-NEXT:    neg a2, s0
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    neg a2, a3
-; RV32IF-NEXT:    or a0, a2, a0
+; RV32IF-NEXT:    feq.s a3, fs0, fs0
+; RV32IF-NEXT:    neg a4, s0
+; RV32IF-NEXT:    neg a5, a1
+; RV32IF-NEXT:    neg a3, a3
 ; RV32IF-NEXT:    and a0, a4, a0
+; RV32IF-NEXT:    and a1, a3, a2
+; RV32IF-NEXT:    or a0, a5, a0
+; RV32IF-NEXT:    and a0, a3, a0
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1400,7 +1400,7 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZFINX-NEXT:    mv a0, s0
 ; RV32IZFINX-NEXT:    call __fixsfdi
-; RV32IZFINX-NEXT:    lui a4, 524288
+; RV32IZFINX-NEXT:    lui a3, 524288
 ; RV32IZFINX-NEXT:    lui a2, 524288
 ; RV32IZFINX-NEXT:    beqz s1, .LBB21_4
 ; RV32IZFINX-NEXT:  # %bb.3:
@@ -1408,19 +1408,19 @@ define i64 @test_rint_si64(float %x) nounwind {
 ; RV32IZFINX-NEXT:  .LBB21_4:
 ; RV32IZFINX-NEXT:    lui a1, 389120
 ; RV32IZFINX-NEXT:    addi a1, a1, -1
-; RV32IZFINX-NEXT:    flt.s a3, a1, s0
-; RV32IZFINX-NEXT:    beqz a3, .LBB21_6
+; RV32IZFINX-NEXT:    flt.s a1, a1, s0
+; RV32IZFINX-NEXT:    beqz a1, .LBB21_6
 ; RV32IZFINX-NEXT:  # %bb.5:
-; RV32IZFINX-NEXT:    addi a2, a4, -1
+; RV32IZFINX-NEXT:    addi a2, a3, -1
 ; RV32IZFINX-NEXT:  .LBB21_6:
-; RV32IZFINX-NEXT:    feq.s a1, s0, s0
-; RV32IZFINX-NEXT:    neg a4, a1
-; RV32IZFINX-NEXT:    and a1, a4, a2
-; RV32IZFINX-NEXT:    neg a2, s1
-; RV32IZFINX-NEXT:    and a0, a2, a0
-; RV32IZFINX-NEXT:    neg a2, a3
-; RV32IZFINX-NEXT:    or a0, a2, a0
+; RV32IZFINX-NEXT:    feq.s a3, s0, s0
+; RV32IZFINX-NEXT:    neg a4, s1
+; RV32IZFINX-NEXT:    neg a5, a1
+; RV32IZFINX-NEXT:    neg a3, a3
 ; RV32IZFINX-NEXT:    and a0, a4, a0
+; RV32IZFINX-NEXT:    and a1, a3, a2
+; RV32IZFINX-NEXT:    or a0, a5, a0
+; RV32IZFINX-NEXT:    and a0, a3, a0
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1489,10 +1489,10 @@ define i64 @test_rint_ui64(float %x) nounwind {
 ; RV32IF-NEXT:    lui a2, %hi(.LCPI23_0)
 ; RV32IF-NEXT:    flw fa5, %lo(.LCPI23_0)(a2)
 ; RV32IF-NEXT:    and a0, s0, a0
+; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    flt.s a2, fa5, fs0
 ; RV32IF-NEXT:    neg a2, a2
 ; RV32IF-NEXT:    or a0, a2, a0
-; RV32IF-NEXT:    and a1, s0, a1
 ; RV32IF-NEXT:    or a1, a2, a1
 ; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1531,11 +1531,11 @@ define i64 @test_rint_ui64(float %x) nounwind {
 ; RV32IZFINX-NEXT:    call __fixunssfdi
 ; RV32IZFINX-NEXT:    and a0, s1, a0
 ; RV32IZFINX-NEXT:    lui a2, 391168
+; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    addi a2, a2, -1
 ; RV32IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZFINX-NEXT:    neg a2, a2
 ; RV32IZFINX-NEXT:    or a0, a2, a0
-; RV32IZFINX-NEXT:    and a1, s1, a1
 ; RV32IZFINX-NEXT:    or a1, a2, a1
 ; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload

diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index a204b928304123..b8dc7804c49082 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -929,19 +929,19 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call f
-; RV32I-NEXT:    lw a0, 12(s7)
-; RV32I-NEXT:    lw a1, 8(s7)
-; RV32I-NEXT:    add a0, a0, s4
-; RV32I-NEXT:    add s3, a1, s3
-; RV32I-NEXT:    sltu s4, s3, a1
+; RV32I-NEXT:    lw a0, 8(s7)
+; RV32I-NEXT:    lw a1, 12(s7)
 ; RV32I-NEXT:    addi s5, s5, 1
-; RV32I-NEXT:    seqz a1, s5
-; RV32I-NEXT:    add s6, s6, a1
-; RV32I-NEXT:    xor a1, s5, s2
-; RV32I-NEXT:    xor a2, s6, s1
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    add s4, a0, s4
-; RV32I-NEXT:    bnez a1, .LBB20_5
+; RV32I-NEXT:    seqz a2, s5
+; RV32I-NEXT:    add s6, s6, a2
+; RV32I-NEXT:    xor a2, s5, s2
+; RV32I-NEXT:    add a1, a1, s4
+; RV32I-NEXT:    xor a3, s6, s1
+; RV32I-NEXT:    or a2, a2, a3
+; RV32I-NEXT:    add s3, a0, s3
+; RV32I-NEXT:    sltu s4, s3, a0
+; RV32I-NEXT:    add s4, a1, s4
+; RV32I-NEXT:    bnez a2, .LBB20_5
 ; RV32I-NEXT:  .LBB20_6: # %for.cond.cleanup
 ; RV32I-NEXT:    mv a0, s3
 ; RV32I-NEXT:    mv a1, s4
@@ -994,19 +994,19 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV32I-MEDIUM-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-MEDIUM-NEXT:    mv a0, s0
 ; RV32I-MEDIUM-NEXT:    call f
-; RV32I-MEDIUM-NEXT:    lw a0, 12(s7)
-; RV32I-MEDIUM-NEXT:    lw a1, 8(s7)
-; RV32I-MEDIUM-NEXT:    add a0, a0, s4
-; RV32I-MEDIUM-NEXT:    add s3, a1, s3
-; RV32I-MEDIUM-NEXT:    sltu s4, s3, a1
+; RV32I-MEDIUM-NEXT:    lw a0, 8(s7)
+; RV32I-MEDIUM-NEXT:    lw a1, 12(s7)
 ; RV32I-MEDIUM-NEXT:    addi s5, s5, 1
-; RV32I-MEDIUM-NEXT:    seqz a1, s5
-; RV32I-MEDIUM-NEXT:    add s6, s6, a1
-; RV32I-MEDIUM-NEXT:    xor a1, s5, s2
-; RV32I-MEDIUM-NEXT:    xor a2, s6, s1
-; RV32I-MEDIUM-NEXT:    or a1, a1, a2
-; RV32I-MEDIUM-NEXT:    add s4, a0, s4
-; RV32I-MEDIUM-NEXT:    bnez a1, .LBB20_5
+; RV32I-MEDIUM-NEXT:    seqz a2, s5
+; RV32I-MEDIUM-NEXT:    add s6, s6, a2
+; RV32I-MEDIUM-NEXT:    xor a2, s5, s2
+; RV32I-MEDIUM-NEXT:    add a1, a1, s4
+; RV32I-MEDIUM-NEXT:    xor a3, s6, s1
+; RV32I-MEDIUM-NEXT:    or a2, a2, a3
+; RV32I-MEDIUM-NEXT:    add s3, a0, s3
+; RV32I-MEDIUM-NEXT:    sltu s4, s3, a0
+; RV32I-MEDIUM-NEXT:    add s4, a1, s4
+; RV32I-MEDIUM-NEXT:    bnez a2, .LBB20_5
 ; RV32I-MEDIUM-NEXT:  .LBB20_6: # %for.cond.cleanup
 ; RV32I-MEDIUM-NEXT:    mv a0, s3
 ; RV32I-MEDIUM-NEXT:    mv a1, s4
@@ -1107,10 +1107,10 @@ define i64 @fold_addi_from_different_bb(i64 %k, i64 %n, ptr %a) nounwind {
 ; RV64I-LARGE-NEXT:  # %bb.1: # %for.body.lr.ph
 ; RV64I-LARGE-NEXT:    mv s0, a2
 ; RV64I-LARGE-NEXT:    mv s1, a1
+; RV64I-LARGE-NEXT:    li s2, 0
 ; RV64I-LARGE-NEXT:  .Lpcrel_hi14:
 ; RV64I-LARGE-NEXT:    auipc a1, %pcrel_hi(.LCPI20_0)
 ; RV64I-LARGE-NEXT:    ld s3, %pcrel_lo(.Lpcrel_hi14)(a1)
-; RV64I-LARGE-NEXT:    li s2, 0
 ; RV64I-LARGE-NEXT:    slli a0, a0, 4
 ; RV64I-LARGE-NEXT:    add s4, a2, a0
 ; RV64I-LARGE-NEXT:  .LBB20_2: # %for.body

diff --git a/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll b/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll
index 1512db87b9311e..2036e7c7adfa81 100644
--- a/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll
+++ b/llvm/test/CodeGen/RISCV/fold-binop-into-select.ll
@@ -33,8 +33,8 @@ define i64 @fold_binop_into_select_2(i1 %c, i64 %x) {
 ; CHECK-LABEL: fold_binop_into_select_2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    li a2, 2
-; CHECK-NEXT:    sub a2, a2, a1
 ; CHECK-NEXT:    slli a0, a0, 63
+; CHECK-NEXT:    sub a2, a2, a1
 ; CHECK-NEXT:    srai a0, a0, 63
 ; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/RISCV/forced-atomics.ll b/llvm/test/CodeGen/RISCV/forced-atomics.ll
index 133d735a46ed7f..e7719dc70660bc 100644
--- a/llvm/test/CodeGen/RISCV/forced-atomics.ll
+++ b/llvm/test/CodeGen/RISCV/forced-atomics.ll
@@ -4531,29 +4531,29 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    mv s0, a1
-; RV32-NEXT:    lw a1, 0(a1)
-; RV32-NEXT:    lw a2, 4(s0)
-; RV32-NEXT:    lw a3, 8(s0)
-; RV32-NEXT:    lw a4, 12(s0)
+; RV32-NEXT:    lw a4, 0(a1)
+; RV32-NEXT:    lw a3, 4(a1)
+; RV32-NEXT:    lw a1, 8(a1)
+; RV32-NEXT:    lw a2, 12(s0)
 ; RV32-NEXT:    mv s1, a0
 ; RV32-NEXT:  .LBB62_1: # %atomicrmw.start
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    addi a0, a1, 1
-; RV32-NEXT:    seqz a5, a0
-; RV32-NEXT:    add a5, a2, a5
-; RV32-NEXT:    or a6, a0, a5
-; RV32-NEXT:    seqz a6, a6
-; RV32-NEXT:    add a6, a3, a6
-; RV32-NEXT:    sltu a7, a6, a3
-; RV32-NEXT:    add a7, a4, a7
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a2, 20(sp)
-; RV32-NEXT:    sw a3, 24(sp)
-; RV32-NEXT:    sw a4, 28(sp)
+; RV32-NEXT:    addi a0, a4, 1
+; RV32-NEXT:    sw a4, 16(sp)
+; RV32-NEXT:    sw a3, 20(sp)
+; RV32-NEXT:    sw a1, 24(sp)
+; RV32-NEXT:    sw a2, 28(sp)
+; RV32-NEXT:    seqz a4, a0
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    or a4, a0, a3
+; RV32-NEXT:    seqz a4, a4
+; RV32-NEXT:    add a4, a1, a4
+; RV32-NEXT:    sltu a1, a4, a1
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    sw a0, 0(sp)
-; RV32-NEXT:    sw a5, 4(sp)
-; RV32-NEXT:    sw a6, 8(sp)
-; RV32-NEXT:    sw a7, 12(sp)
+; RV32-NEXT:    sw a3, 4(sp)
+; RV32-NEXT:    sw a4, 8(sp)
+; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    li a0, 16
 ; RV32-NEXT:    addi a2, sp, 16
 ; RV32-NEXT:    mv a3, sp
@@ -4561,16 +4561,16 @@ define i128 @rmw128(ptr %p) nounwind {
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    mv a1, s0
 ; RV32-NEXT:    call __atomic_compare_exchange
-; RV32-NEXT:    lw a1, 16(sp)
-; RV32-NEXT:    lw a2, 20(sp)
-; RV32-NEXT:    lw a3, 24(sp)
-; RV32-NEXT:    lw a4, 28(sp)
+; RV32-NEXT:    lw a4, 16(sp)
+; RV32-NEXT:    lw a3, 20(sp)
+; RV32-NEXT:    lw a1, 24(sp)
+; RV32-NEXT:    lw a2, 28(sp)
 ; RV32-NEXT:    beqz a0, .LBB62_1
 ; RV32-NEXT:  # %bb.2: # %atomicrmw.end
-; RV32-NEXT:    sw a1, 0(s1)
-; RV32-NEXT:    sw a2, 4(s1)
-; RV32-NEXT:    sw a3, 8(s1)
-; RV32-NEXT:    sw a4, 12(s1)
+; RV32-NEXT:    sw a4, 0(s1)
+; RV32-NEXT:    sw a3, 4(s1)
+; RV32-NEXT:    sw a1, 8(s1)
+; RV32-NEXT:    sw a2, 12(s1)
 ; RV32-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload

diff --git a/llvm/test/CodeGen/RISCV/fp128.ll b/llvm/test/CodeGen/RISCV/fp128.ll
index 0bde85b54e5d15..581ee5cd2304df 100644
--- a/llvm/test/CodeGen/RISCV/fp128.ll
+++ b/llvm/test/CodeGen/RISCV/fp128.ll
@@ -14,19 +14,19 @@ define i32 @test_load_and_cmp() nounwind {
 ; RV32I-NEXT:    addi sp, sp, -48
 ; RV32I-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lui a0, %hi(x)
+; RV32I-NEXT:    lui a1, %hi(y)
 ; RV32I-NEXT:    lw a2, %lo(x)(a0)
 ; RV32I-NEXT:    lw a3, %lo(x+4)(a0)
 ; RV32I-NEXT:    lw a4, %lo(x+8)(a0)
 ; RV32I-NEXT:    lw a5, %lo(x+12)(a0)
-; RV32I-NEXT:    lui a0, %hi(y)
-; RV32I-NEXT:    lw a1, %lo(y)(a0)
-; RV32I-NEXT:    lw a6, %lo(y+4)(a0)
-; RV32I-NEXT:    lw a7, %lo(y+8)(a0)
-; RV32I-NEXT:    lw a0, %lo(y+12)(a0)
-; RV32I-NEXT:    sw a1, 8(sp)
+; RV32I-NEXT:    lw a0, %lo(y)(a1)
+; RV32I-NEXT:    lw a6, %lo(y+4)(a1)
+; RV32I-NEXT:    lw a7, %lo(y+8)(a1)
+; RV32I-NEXT:    lw a1, %lo(y+12)(a1)
+; RV32I-NEXT:    sw a0, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
 ; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a1, 20(sp)
 ; RV32I-NEXT:    addi a0, sp, 24
 ; RV32I-NEXT:    addi a1, sp, 8
 ; RV32I-NEXT:    sw a2, 24(sp)
@@ -51,19 +51,19 @@ define i32 @test_add_and_fptosi() nounwind {
 ; RV32I-NEXT:    addi sp, sp, -80
 ; RV32I-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    lui a0, %hi(x)
+; RV32I-NEXT:    lui a1, %hi(y)
 ; RV32I-NEXT:    lw a3, %lo(x)(a0)
 ; RV32I-NEXT:    lw a4, %lo(x+4)(a0)
 ; RV32I-NEXT:    lw a5, %lo(x+8)(a0)
 ; RV32I-NEXT:    lw a6, %lo(x+12)(a0)
-; RV32I-NEXT:    lui a0, %hi(y)
-; RV32I-NEXT:    lw a1, %lo(y)(a0)
-; RV32I-NEXT:    lw a2, %lo(y+4)(a0)
-; RV32I-NEXT:    lw a7, %lo(y+8)(a0)
-; RV32I-NEXT:    lw a0, %lo(y+12)(a0)
-; RV32I-NEXT:    sw a1, 24(sp)
+; RV32I-NEXT:    lw a0, %lo(y)(a1)
+; RV32I-NEXT:    lw a2, %lo(y+4)(a1)
+; RV32I-NEXT:    lw a7, %lo(y+8)(a1)
+; RV32I-NEXT:    lw a1, %lo(y+12)(a1)
+; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw a2, 28(sp)
 ; RV32I-NEXT:    sw a7, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a1, 36(sp)
 ; RV32I-NEXT:    addi a0, sp, 56
 ; RV32I-NEXT:    addi a1, sp, 40
 ; RV32I-NEXT:    addi a2, sp, 24

diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index bbdfda5c1e10d5..c5c3b199447a9a 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -1282,8 +1282,8 @@ define i64 @utest_f64i64(double %x) {
 ; RV32IF-NEXT:    lw a2, 12(sp)
 ; RV32IF-NEXT:    lw a3, 8(sp)
 ; RV32IF-NEXT:    or a4, a1, a0
-; RV32IF-NEXT:    seqz a4, a4
 ; RV32IF-NEXT:    xori a0, a0, 1
+; RV32IF-NEXT:    seqz a4, a4
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    seqz a0, a0
 ; RV32IF-NEXT:    addi a0, a0, -1
@@ -1326,8 +1326,8 @@ define i64 @utest_f64i64(double %x) {
 ; RV32IFD-NEXT:    lw a2, 12(sp)
 ; RV32IFD-NEXT:    lw a3, 8(sp)
 ; RV32IFD-NEXT:    or a4, a1, a0
-; RV32IFD-NEXT:    seqz a4, a4
 ; RV32IFD-NEXT:    xori a0, a0, 1
+; RV32IFD-NEXT:    seqz a4, a4
 ; RV32IFD-NEXT:    or a0, a0, a1
 ; RV32IFD-NEXT:    seqz a0, a0
 ; RV32IFD-NEXT:    addi a0, a0, -1
@@ -1592,8 +1592,8 @@ define i64 @utest_f32i64(float %x) {
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a0, a0, -1
@@ -1853,8 +1853,8 @@ define i64 @utesth_f16i64(half %x) {
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a0, a0, -1
@@ -2168,8 +2168,8 @@ define i32 @ustest_f64i32_mm(double %x) {
 ; RV32IF-NEXT:    slti a2, a1, 1
 ; RV32IF-NEXT:  .LBB29_3: # %entry
 ; RV32IF-NEXT:    addi a3, a2, -1
-; RV32IF-NEXT:    or a0, a3, a0
 ; RV32IF-NEXT:    neg a2, a2
+; RV32IF-NEXT:    or a0, a3, a0
 ; RV32IF-NEXT:    and a1, a2, a1
 ; RV32IF-NEXT:    slti a1, a1, 0
 ; RV32IF-NEXT:    addi a1, a1, -1
@@ -2459,8 +2459,8 @@ define i32 @ustest_f16i32_mm(half %x) {
 ; RV32-NEXT:    slti a2, a1, 1
 ; RV32-NEXT:  .LBB35_3: # %entry
 ; RV32-NEXT:    addi a3, a2, -1
-; RV32-NEXT:    or a0, a3, a0
 ; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    or a0, a3, a0
 ; RV32-NEXT:    and a1, a2, a1
 ; RV32-NEXT:    slti a1, a1, 0
 ; RV32-NEXT:    addi a1, a1, -1
@@ -3216,8 +3216,8 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    lw a2, 12(sp)
 ; RV32IF-NEXT:    lw a3, 8(sp)
 ; RV32IF-NEXT:    or a4, a1, a0
-; RV32IF-NEXT:    seqz a4, a4
 ; RV32IF-NEXT:    xori a0, a0, 1
+; RV32IF-NEXT:    seqz a4, a4
 ; RV32IF-NEXT:    or a0, a0, a1
 ; RV32IF-NEXT:    seqz a0, a0
 ; RV32IF-NEXT:    addi a0, a0, -1
@@ -3260,8 +3260,8 @@ define i64 @utest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    lw a2, 12(sp)
 ; RV32IFD-NEXT:    lw a3, 8(sp)
 ; RV32IFD-NEXT:    or a4, a1, a0
-; RV32IFD-NEXT:    seqz a4, a4
 ; RV32IFD-NEXT:    xori a0, a0, 1
+; RV32IFD-NEXT:    seqz a4, a4
 ; RV32IFD-NEXT:    or a0, a0, a1
 ; RV32IFD-NEXT:    seqz a0, a0
 ; RV32IFD-NEXT:    addi a0, a0, -1
@@ -3335,11 +3335,11 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:  .LBB47_2: # %entry
 ; RV64-NEXT:    slti a1, a1, 1
+; RV64-NEXT:    slti a2, a2, 0
 ; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    slti a1, a2, 0
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore ra
 ; RV64-NEXT:    addi sp, sp, 16
@@ -3484,8 +3484,8 @@ define i64 @utest_f32i64_mm(float %x) {
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a0, a0, -1
@@ -3573,11 +3573,11 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:  .LBB50_2: # %entry
 ; RV64-NEXT:    slti a1, a1, 1
+; RV64-NEXT:    slti a2, a2, 0
 ; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    slti a1, a2, 0
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore ra
 ; RV64-NEXT:    addi sp, sp, 16
@@ -3719,8 +3719,8 @@ define i64 @utesth_f16i64_mm(half %x) {
 ; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 8(sp)
 ; RV32-NEXT:    or a4, a1, a0
-; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    xori a0, a0, 1
+; RV32-NEXT:    seqz a4, a4
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a0, a0, -1
@@ -3811,11 +3811,11 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:  .LBB53_2: # %entry
 ; RV64-NEXT:    slti a1, a1, 1
+; RV64-NEXT:    slti a2, a2, 0
 ; RV64-NEXT:    neg a1, a1
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    slti a1, a2, 0
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    .cfi_restore ra
 ; RV64-NEXT:    addi sp, sp, 16

diff --git a/llvm/test/CodeGen/RISCV/fpenv.ll b/llvm/test/CodeGen/RISCV/fpenv.ll
index 48aec0b70b0d40..895effb4ce49bb 100644
--- a/llvm/test/CodeGen/RISCV/fpenv.ll
+++ b/llvm/test/CodeGen/RISCV/fpenv.ll
@@ -6,8 +6,8 @@ define i32 @func_01() {
 ; RV32IF-LABEL: func_01:
 ; RV32IF:       # %bb.0:
 ; RV32IF-NEXT:    frrm a0
-; RV32IF-NEXT:    slli a0, a0, 2
 ; RV32IF-NEXT:    lui a1, 66
+; RV32IF-NEXT:    slli a0, a0, 2
 ; RV32IF-NEXT:    addi a1, a1, 769
 ; RV32IF-NEXT:    srl a0, a1, a0
 ; RV32IF-NEXT:    andi a0, a0, 7
@@ -16,8 +16,8 @@ define i32 @func_01() {
 ; RV64IF-LABEL: func_01:
 ; RV64IF:       # %bb.0:
 ; RV64IF-NEXT:    frrm a0
-; RV64IF-NEXT:    slli a0, a0, 2
 ; RV64IF-NEXT:    lui a1, 66
+; RV64IF-NEXT:    slli a0, a0, 2
 ; RV64IF-NEXT:    addiw a1, a1, 769
 ; RV64IF-NEXT:    srl a0, a1, a0
 ; RV64IF-NEXT:    andi a0, a0, 7
@@ -40,8 +40,8 @@ define void @func_02(i32 %rm) {
 ; RV64IF-LABEL: func_02:
 ; RV64IF:       # %bb.0:
 ; RV64IF-NEXT:    slli a0, a0, 32
-; RV64IF-NEXT:    srli a0, a0, 30
 ; RV64IF-NEXT:    lui a1, 66
+; RV64IF-NEXT:    srli a0, a0, 30
 ; RV64IF-NEXT:    addiw a1, a1, 769
 ; RV64IF-NEXT:    srl a0, a1, a0
 ; RV64IF-NEXT:    andi a0, a0, 7

diff --git a/llvm/test/CodeGen/RISCV/ghccc-rv32.ll b/llvm/test/CodeGen/RISCV/ghccc-rv32.ll
index 0f9511125adbae..c4c14c6cb8726e 100644
--- a/llvm/test/CodeGen/RISCV/ghccc-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/ghccc-rv32.ll
@@ -33,50 +33,50 @@ define ghccc void @foo() nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(d6)
+; CHECK-NEXT:    lui a1, %hi(d5)
+; CHECK-NEXT:    lui a2, %hi(d4)
+; CHECK-NEXT:    lui a3, %hi(d3)
+; CHECK-NEXT:    lui a4, %hi(d2)
+; CHECK-NEXT:    lui a5, %hi(d1)
+; CHECK-NEXT:    lui a6, %hi(f6)
+; CHECK-NEXT:    lui a7, %hi(f5)
+; CHECK-NEXT:    lui t0, %hi(f4)
+; CHECK-NEXT:    lui t1, %hi(f3)
+; CHECK-NEXT:    lui t2, %hi(f2)
 ; CHECK-NEXT:    fld fs11, %lo(d6)(a0)
-; CHECK-NEXT:    lui a0, %hi(d5)
-; CHECK-NEXT:    fld fs10, %lo(d5)(a0)
-; CHECK-NEXT:    lui a0, %hi(d4)
-; CHECK-NEXT:    fld fs9, %lo(d4)(a0)
-; CHECK-NEXT:    lui a0, %hi(d3)
-; CHECK-NEXT:    fld fs8, %lo(d3)(a0)
-; CHECK-NEXT:    lui a0, %hi(d2)
-; CHECK-NEXT:    fld fs7, %lo(d2)(a0)
-; CHECK-NEXT:    lui a0, %hi(d1)
-; CHECK-NEXT:    fld fs6, %lo(d1)(a0)
-; CHECK-NEXT:    lui a0, %hi(f6)
-; CHECK-NEXT:    flw fs5, %lo(f6)(a0)
-; CHECK-NEXT:    lui a0, %hi(f5)
-; CHECK-NEXT:    flw fs4, %lo(f5)(a0)
-; CHECK-NEXT:    lui a0, %hi(f4)
-; CHECK-NEXT:    flw fs3, %lo(f4)(a0)
-; CHECK-NEXT:    lui a0, %hi(f3)
-; CHECK-NEXT:    flw fs2, %lo(f3)(a0)
-; CHECK-NEXT:    lui a0, %hi(f2)
-; CHECK-NEXT:    flw fs1, %lo(f2)(a0)
 ; CHECK-NEXT:    lui a0, %hi(f1)
+; CHECK-NEXT:    fld fs10, %lo(d5)(a1)
+; CHECK-NEXT:    lui a1, %hi(splim)
+; CHECK-NEXT:    fld fs9, %lo(d4)(a2)
+; CHECK-NEXT:    lui a2, %hi(r7)
+; CHECK-NEXT:    fld fs8, %lo(d3)(a3)
+; CHECK-NEXT:    lui a3, %hi(r6)
+; CHECK-NEXT:    fld fs7, %lo(d2)(a4)
+; CHECK-NEXT:    lui a4, %hi(r5)
+; CHECK-NEXT:    fld fs6, %lo(d1)(a5)
+; CHECK-NEXT:    lui a5, %hi(r4)
+; CHECK-NEXT:    flw fs5, %lo(f6)(a6)
+; CHECK-NEXT:    lui a6, %hi(r3)
+; CHECK-NEXT:    flw fs4, %lo(f5)(a7)
+; CHECK-NEXT:    lui a7, %hi(r2)
+; CHECK-NEXT:    flw fs3, %lo(f4)(t0)
+; CHECK-NEXT:    lui t0, %hi(r1)
+; CHECK-NEXT:    flw fs2, %lo(f3)(t1)
+; CHECK-NEXT:    lui t1, %hi(hp)
+; CHECK-NEXT:    flw fs1, %lo(f2)(t2)
+; CHECK-NEXT:    lui t2, %hi(sp)
 ; CHECK-NEXT:    flw fs0, %lo(f1)(a0)
-; CHECK-NEXT:    lui a0, %hi(splim)
-; CHECK-NEXT:    lw s11, %lo(splim)(a0)
-; CHECK-NEXT:    lui a0, %hi(r7)
-; CHECK-NEXT:    lw s10, %lo(r7)(a0)
-; CHECK-NEXT:    lui a0, %hi(r6)
-; CHECK-NEXT:    lw s9, %lo(r6)(a0)
-; CHECK-NEXT:    lui a0, %hi(r5)
-; CHECK-NEXT:    lw s8, %lo(r5)(a0)
-; CHECK-NEXT:    lui a0, %hi(r4)
-; CHECK-NEXT:    lw s7, %lo(r4)(a0)
-; CHECK-NEXT:    lui a0, %hi(r3)
-; CHECK-NEXT:    lw s6, %lo(r3)(a0)
-; CHECK-NEXT:    lui a0, %hi(r2)
-; CHECK-NEXT:    lw s5, %lo(r2)(a0)
-; CHECK-NEXT:    lui a0, %hi(r1)
-; CHECK-NEXT:    lw s4, %lo(r1)(a0)
-; CHECK-NEXT:    lui a0, %hi(hp)
-; CHECK-NEXT:    lw s3, %lo(hp)(a0)
-; CHECK-NEXT:    lui a0, %hi(sp)
-; CHECK-NEXT:    lw s2, %lo(sp)(a0)
 ; CHECK-NEXT:    lui a0, %hi(base)
+; CHECK-NEXT:    lw s11, %lo(splim)(a1)
+; CHECK-NEXT:    lw s10, %lo(r7)(a2)
+; CHECK-NEXT:    lw s9, %lo(r6)(a3)
+; CHECK-NEXT:    lw s8, %lo(r5)(a4)
+; CHECK-NEXT:    lw s7, %lo(r4)(a5)
+; CHECK-NEXT:    lw s6, %lo(r3)(a6)
+; CHECK-NEXT:    lw s5, %lo(r2)(a7)
+; CHECK-NEXT:    lw s4, %lo(r1)(t0)
+; CHECK-NEXT:    lw s3, %lo(hp)(t1)
+; CHECK-NEXT:    lw s2, %lo(sp)(t2)
 ; CHECK-NEXT:    lw s1, %lo(base)(a0)
 ; CHECK-NEXT:    tail bar
 entry:

diff --git a/llvm/test/CodeGen/RISCV/ghccc-rv64.ll b/llvm/test/CodeGen/RISCV/ghccc-rv64.ll
index 79afd4bc375d53..8e3fd2ca709aa5 100644
--- a/llvm/test/CodeGen/RISCV/ghccc-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/ghccc-rv64.ll
@@ -33,50 +33,50 @@ define ghccc void @foo() nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(d6)
+; CHECK-NEXT:    lui a1, %hi(d5)
+; CHECK-NEXT:    lui a2, %hi(d4)
+; CHECK-NEXT:    lui a3, %hi(d3)
+; CHECK-NEXT:    lui a4, %hi(d2)
+; CHECK-NEXT:    lui a5, %hi(d1)
+; CHECK-NEXT:    lui a6, %hi(f6)
+; CHECK-NEXT:    lui a7, %hi(f5)
+; CHECK-NEXT:    lui t0, %hi(f4)
+; CHECK-NEXT:    lui t1, %hi(f3)
+; CHECK-NEXT:    lui t2, %hi(f2)
 ; CHECK-NEXT:    fld fs11, %lo(d6)(a0)
-; CHECK-NEXT:    lui a0, %hi(d5)
-; CHECK-NEXT:    fld fs10, %lo(d5)(a0)
-; CHECK-NEXT:    lui a0, %hi(d4)
-; CHECK-NEXT:    fld fs9, %lo(d4)(a0)
-; CHECK-NEXT:    lui a0, %hi(d3)
-; CHECK-NEXT:    fld fs8, %lo(d3)(a0)
-; CHECK-NEXT:    lui a0, %hi(d2)
-; CHECK-NEXT:    fld fs7, %lo(d2)(a0)
-; CHECK-NEXT:    lui a0, %hi(d1)
-; CHECK-NEXT:    fld fs6, %lo(d1)(a0)
-; CHECK-NEXT:    lui a0, %hi(f6)
-; CHECK-NEXT:    flw fs5, %lo(f6)(a0)
-; CHECK-NEXT:    lui a0, %hi(f5)
-; CHECK-NEXT:    flw fs4, %lo(f5)(a0)
-; CHECK-NEXT:    lui a0, %hi(f4)
-; CHECK-NEXT:    flw fs3, %lo(f4)(a0)
-; CHECK-NEXT:    lui a0, %hi(f3)
-; CHECK-NEXT:    flw fs2, %lo(f3)(a0)
-; CHECK-NEXT:    lui a0, %hi(f2)
-; CHECK-NEXT:    flw fs1, %lo(f2)(a0)
 ; CHECK-NEXT:    lui a0, %hi(f1)
+; CHECK-NEXT:    fld fs10, %lo(d5)(a1)
+; CHECK-NEXT:    lui a1, %hi(splim)
+; CHECK-NEXT:    fld fs9, %lo(d4)(a2)
+; CHECK-NEXT:    lui a2, %hi(r7)
+; CHECK-NEXT:    fld fs8, %lo(d3)(a3)
+; CHECK-NEXT:    lui a3, %hi(r6)
+; CHECK-NEXT:    fld fs7, %lo(d2)(a4)
+; CHECK-NEXT:    lui a4, %hi(r5)
+; CHECK-NEXT:    fld fs6, %lo(d1)(a5)
+; CHECK-NEXT:    lui a5, %hi(r4)
+; CHECK-NEXT:    flw fs5, %lo(f6)(a6)
+; CHECK-NEXT:    lui a6, %hi(r3)
+; CHECK-NEXT:    flw fs4, %lo(f5)(a7)
+; CHECK-NEXT:    lui a7, %hi(r2)
+; CHECK-NEXT:    flw fs3, %lo(f4)(t0)
+; CHECK-NEXT:    lui t0, %hi(r1)
+; CHECK-NEXT:    flw fs2, %lo(f3)(t1)
+; CHECK-NEXT:    lui t1, %hi(hp)
+; CHECK-NEXT:    flw fs1, %lo(f2)(t2)
+; CHECK-NEXT:    lui t2, %hi(sp)
 ; CHECK-NEXT:    flw fs0, %lo(f1)(a0)
-; CHECK-NEXT:    lui a0, %hi(splim)
-; CHECK-NEXT:    ld s11, %lo(splim)(a0)
-; CHECK-NEXT:    lui a0, %hi(r7)
-; CHECK-NEXT:    ld s10, %lo(r7)(a0)
-; CHECK-NEXT:    lui a0, %hi(r6)
-; CHECK-NEXT:    ld s9, %lo(r6)(a0)
-; CHECK-NEXT:    lui a0, %hi(r5)
-; CHECK-NEXT:    ld s8, %lo(r5)(a0)
-; CHECK-NEXT:    lui a0, %hi(r4)
-; CHECK-NEXT:    ld s7, %lo(r4)(a0)
-; CHECK-NEXT:    lui a0, %hi(r3)
-; CHECK-NEXT:    ld s6, %lo(r3)(a0)
-; CHECK-NEXT:    lui a0, %hi(r2)
-; CHECK-NEXT:    ld s5, %lo(r2)(a0)
-; CHECK-NEXT:    lui a0, %hi(r1)
-; CHECK-NEXT:    ld s4, %lo(r1)(a0)
-; CHECK-NEXT:    lui a0, %hi(hp)
-; CHECK-NEXT:    ld s3, %lo(hp)(a0)
-; CHECK-NEXT:    lui a0, %hi(sp)
-; CHECK-NEXT:    ld s2, %lo(sp)(a0)
 ; CHECK-NEXT:    lui a0, %hi(base)
+; CHECK-NEXT:    ld s11, %lo(splim)(a1)
+; CHECK-NEXT:    ld s10, %lo(r7)(a2)
+; CHECK-NEXT:    ld s9, %lo(r6)(a3)
+; CHECK-NEXT:    ld s8, %lo(r5)(a4)
+; CHECK-NEXT:    ld s7, %lo(r4)(a5)
+; CHECK-NEXT:    ld s6, %lo(r3)(a6)
+; CHECK-NEXT:    ld s5, %lo(r2)(a7)
+; CHECK-NEXT:    ld s4, %lo(r1)(t0)
+; CHECK-NEXT:    ld s3, %lo(hp)(t1)
+; CHECK-NEXT:    ld s2, %lo(sp)(t2)
 ; CHECK-NEXT:    ld s1, %lo(base)(a0)
 ; CHECK-NEXT:    tail bar
 entry:

diff --git a/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll b/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll
index 6437beae090154..abc555b994a3b9 100644
--- a/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll
+++ b/llvm/test/CodeGen/RISCV/ghccc-without-f-reg.ll
@@ -14,17 +14,17 @@ define ghccc void @caller_float() nounwind {
 ; CHECK-LABEL: caller_float:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(f6)
+; CHECK-NEXT:    lui a1, %hi(f5)
+; CHECK-NEXT:    lui a2, %hi(f4)
+; CHECK-NEXT:    lui a3, %hi(f3)
+; CHECK-NEXT:    lui a4, %hi(f2)
+; CHECK-NEXT:    lui a5, %hi(f1)
 ; CHECK-NEXT:    lw s6, %lo(f6)(a0)
-; CHECK-NEXT:    lui a0, %hi(f5)
-; CHECK-NEXT:    lw s5, %lo(f5)(a0)
-; CHECK-NEXT:    lui a0, %hi(f4)
-; CHECK-NEXT:    lw s4, %lo(f4)(a0)
-; CHECK-NEXT:    lui a0, %hi(f3)
-; CHECK-NEXT:    lw s3, %lo(f3)(a0)
-; CHECK-NEXT:    lui a0, %hi(f2)
-; CHECK-NEXT:    lw s2, %lo(f2)(a0)
-; CHECK-NEXT:    lui a0, %hi(f1)
-; CHECK-NEXT:    lw s1, %lo(f1)(a0)
+; CHECK-NEXT:    lw s5, %lo(f5)(a1)
+; CHECK-NEXT:    lw s4, %lo(f4)(a2)
+; CHECK-NEXT:    lw s3, %lo(f3)(a3)
+; CHECK-NEXT:    lw s2, %lo(f2)(a4)
+; CHECK-NEXT:    lw s1, %lo(f1)(a5)
 ; CHECK-NEXT:    tail callee_float
 entry:
   %0  = load float, ptr @f6
@@ -50,17 +50,17 @@ define ghccc void @caller_double() nounwind {
 ; CHECK-LABEL: caller_double:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a0, %hi(d6)
+; CHECK-NEXT:    lui a1, %hi(d5)
+; CHECK-NEXT:    lui a2, %hi(d4)
+; CHECK-NEXT:    lui a3, %hi(d3)
+; CHECK-NEXT:    lui a4, %hi(d2)
+; CHECK-NEXT:    lui a5, %hi(d1)
 ; CHECK-NEXT:    ld s6, %lo(d6)(a0)
-; CHECK-NEXT:    lui a0, %hi(d5)
-; CHECK-NEXT:    ld s5, %lo(d5)(a0)
-; CHECK-NEXT:    lui a0, %hi(d4)
-; CHECK-NEXT:    ld s4, %lo(d4)(a0)
-; CHECK-NEXT:    lui a0, %hi(d3)
-; CHECK-NEXT:    ld s3, %lo(d3)(a0)
-; CHECK-NEXT:    lui a0, %hi(d2)
-; CHECK-NEXT:    ld s2, %lo(d2)(a0)
-; CHECK-NEXT:    lui a0, %hi(d1)
-; CHECK-NEXT:    ld s1, %lo(d1)(a0)
+; CHECK-NEXT:    ld s5, %lo(d5)(a1)
+; CHECK-NEXT:    ld s4, %lo(d4)(a2)
+; CHECK-NEXT:    ld s3, %lo(d3)(a3)
+; CHECK-NEXT:    ld s2, %lo(d2)(a4)
+; CHECK-NEXT:    ld s1, %lo(d1)(a5)
 ; CHECK-NEXT:    tail callee_double
 entry:
   %0  = load double, ptr @d6

diff --git a/llvm/test/CodeGen/RISCV/global-merge.ll b/llvm/test/CodeGen/RISCV/global-merge.ll
index 633ba719c6a305..9dde032b69f8b1 100644
--- a/llvm/test/CodeGen/RISCV/global-merge.ll
+++ b/llvm/test/CodeGen/RISCV/global-merge.ll
@@ -23,12 +23,12 @@
 define void @f1(i32 %a) nounwind {
 ; CHECK-LABEL: f1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a1, %hi(eg1)
+; CHECK-NEXT:    sw a0, %lo(eg1)(a1)
 ; CHECK-NEXT:    lui a1, %hi(.L_MergedGlobals)
 ; CHECK-NEXT:    sw a0, %lo(.L_MergedGlobals)(a1)
 ; CHECK-NEXT:    addi a1, a1, %lo(.L_MergedGlobals)
 ; CHECK-NEXT:    sw a0, 4(a1)
-; CHECK-NEXT:    lui a1, %hi(eg1)
-; CHECK-NEXT:    sw a0, %lo(eg1)(a1)
 ; CHECK-NEXT:    lui a1, %hi(eg2)
 ; CHECK-NEXT:    sw a0, %lo(eg2)(a1)
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/RISCV/half-arith-strict.ll b/llvm/test/CodeGen/RISCV/half-arith-strict.ll
index 4e4aad7309791c..636739cf38984a 100644
--- a/llvm/test/CodeGen/RISCV/half-arith-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith-strict.ll
@@ -243,28 +243,28 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa2
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa1
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa1
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa3, fa4, fa5
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa4, fa3, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECK-ZFHMIN-NEXT:    ret
 ;
 ; CHECK-ZHINXMIN-LABEL: fmsub_h:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a2, a2, zero
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
-; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECK-ZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -295,17 +295,17 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa2
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa4, fa4
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
-; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa4
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a2, fa4
+; CHECK-ZFHMIN-NEXT:    xor a1, a1, a0
+; CHECK-ZFHMIN-NEXT:    xor a0, a2, a0
+; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a1
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -317,12 +317,12 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZHINXMIN-LABEL: fnmadd_h:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK-ZHINXMIN-NEXT:    fadd.s a0, a0, zero
-; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a0, a0, a3
 ; CHECK-ZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
@@ -359,17 +359,17 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa2
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa4, fa4
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
-; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa4
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a2, fa4
+; CHECK-ZFHMIN-NEXT:    xor a1, a1, a0
+; CHECK-ZFHMIN-NEXT:    xor a0, a2, a0
+; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a1
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -381,12 +381,12 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZHINXMIN-LABEL: fnmadd_h_2:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECK-ZHINXMIN-NEXT:    fadd.s a1, a1, zero
-; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fadd.s a1, a1, zero
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a1, a1, a3
 ; CHECK-ZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
@@ -421,28 +421,28 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa2
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa2
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa1
-; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa5, fa3, fa4
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa1
+; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa5, fa4, fa3
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECK-ZFHMIN-NEXT:    ret
 ;
 ; CHECK-ZHINXMIN-LABEL: fnmsub_h:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a0, a0, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECK-ZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -471,28 +471,28 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind strictfp {
 ; CHECK-ZFHMIN:       # %bb.0:
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECK-ZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK-ZFHMIN-NEXT:    lui a0, 1048568
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa2
 ; CHECK-ZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECK-ZFHMIN-NEXT:    lui a1, 1048568
-; CHECK-ZFHMIN-NEXT:    xor a0, a0, a1
+; CHECK-ZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECK-ZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECK-ZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa2
-; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa3, fa5, fa4
+; CHECK-ZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECK-ZFHMIN-NEXT:    fmadd.s fa5, fa4, fa5, fa3
 ; CHECK-ZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECK-ZFHMIN-NEXT:    ret
 ;
 ; CHECK-ZHINXMIN-LABEL: fnmsub_h_2:
 ; CHECK-ZHINXMIN:       # %bb.0:
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
+; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECK-ZHINXMIN-NEXT:    fadd.s a1, a1, zero
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a1, a1
-; CHECK-ZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECK-ZHINXMIN-NEXT:    xor a1, a1, a3
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECK-ZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECK-ZHINXMIN-NEXT:    fcvt.h.s a0, a0

diff  --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll
index 4c2deafdc7e668..a218e89948d4b4 100644
--- a/llvm/test/CodeGen/RISCV/half-arith.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith.ll
@@ -425,8 +425,8 @@ define half @fsgnj_h(half %a, half %b) nounwind {
 ; RV32I-LABEL: fsgnj_h:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 1048568
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 17
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -434,8 +434,8 @@ define half @fsgnj_h(half %a, half %b) nounwind {
 ; RV64I-LABEL: fsgnj_h:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 1048568
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 49
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -469,8 +469,8 @@ define half @fsgnj_h(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-NEXT:    # kill: def $x11_h killed $x11_h def $x11
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32IZHINXMIN-NEXT:    lui a2, 1048568
-; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
+; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -481,8 +481,8 @@ define half @fsgnj_h(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-NEXT:    # kill: def $x11_h killed $x11_h def $x11
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64IZHINXMIN-NEXT:    lui a2, 1048568
-; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
+; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -573,11 +573,11 @@ define i32 @fneg_h(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-LABEL: fneg_h:
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -587,9 +587,9 @@ define i32 @fneg_h(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: fneg_h:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    lui a1, 1048568
 ; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECKIZHINXMIN-NEXT:    lui a1, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a1, a0, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
@@ -644,8 +644,8 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:    call __truncsfhf2
 ; RV32I-NEXT:    lui a1, 1048568
-; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli s1, s1, 17
+; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    srli s1, s1, 17
 ; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -683,8 +683,8 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    call __truncsfhf2
 ; RV64I-NEXT:    lui a1, 1048568
-; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli s1, s1, 49
+; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    srli s1, s1, 49
 ; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -699,12 +699,12 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; RV32IZFHMIN-NEXT:    lui a0, 1048568
 ; RV32IZFHMIN-NEXT:    fadd.s fa5, fa4, fa5
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fmv.x.h a0, fa5
-; RV32IZFHMIN-NEXT:    not a0, a0
-; RV32IZFHMIN-NEXT:    lui a1, 1048568
-; RV32IZFHMIN-NEXT:    and a0, a0, a1
+; RV32IZFHMIN-NEXT:    fmv.x.h a1, fa5
+; RV32IZFHMIN-NEXT:    not a1, a1
+; RV32IZFHMIN-NEXT:    and a0, a1, a0
 ; RV32IZFHMIN-NEXT:    fmv.x.h a1, fa0
 ; RV32IZFHMIN-NEXT:    slli a1, a1, 17
 ; RV32IZFHMIN-NEXT:    srli a1, a1, 17
@@ -716,12 +716,12 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; RV64IZFHMIN-NEXT:    lui a0, 1048568
 ; RV64IZFHMIN-NEXT:    fadd.s fa5, fa4, fa5
 ; RV64IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV64IZFHMIN-NEXT:    fmv.x.h a0, fa5
-; RV64IZFHMIN-NEXT:    not a0, a0
-; RV64IZFHMIN-NEXT:    lui a1, 1048568
-; RV64IZFHMIN-NEXT:    and a0, a0, a1
+; RV64IZFHMIN-NEXT:    fmv.x.h a1, fa5
+; RV64IZFHMIN-NEXT:    not a1, a1
+; RV64IZFHMIN-NEXT:    and a0, a1, a0
 ; RV64IZFHMIN-NEXT:    fmv.x.h a1, fa0
 ; RV64IZFHMIN-NEXT:    slli a1, a1, 49
 ; RV64IZFHMIN-NEXT:    srli a1, a1, 49
@@ -735,11 +735,11 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a2, a0
 ; RV32IZHINXMIN-NEXT:    fadd.s a1, a2, a1
+; RV32IZHINXMIN-NEXT:    lui a2, 1048568
+; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; RV32IZHINXMIN-NEXT:    not a1, a1
-; RV32IZHINXMIN-NEXT:    lui a2, 1048568
 ; RV32IZHINXMIN-NEXT:    and a1, a1, a2
-; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -751,11 +751,11 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a2, a0
 ; RV64IZHINXMIN-NEXT:    fadd.s a1, a2, a1
+; RV64IZHINXMIN-NEXT:    lui a2, 1048568
+; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; RV64IZHINXMIN-NEXT:    not a1, a1
-; RV64IZHINXMIN-NEXT:    lui a2, 1048568
 ; RV64IZHINXMIN-NEXT:    and a1, a1, a2
-; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -1298,28 +1298,28 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa2
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa1
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa3, fa4, fa5
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa4, fa3, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fmsub_h:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -1466,17 +1466,17 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa2
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
-; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa4
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    fmv.x.h a2, fa4
+; CHECKIZFHMIN-NEXT:    xor a1, a1, a0
+; CHECKIZFHMIN-NEXT:    xor a0, a2, a0
+; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a1
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -1488,12 +1488,12 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fnmadd_h:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a0, a0, a3
 ; CHECKIZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
@@ -1646,17 +1646,17 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa2
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
-; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa4
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    fmv.x.h a2, fa4
+; CHECKIZFHMIN-NEXT:    xor a1, a1, a0
+; CHECKIZFHMIN-NEXT:    xor a0, a2, a0
+; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a1
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -1668,12 +1668,12 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fnmadd_h_2:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
 ; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a1, a1, a3
 ; CHECKIZHINXMIN-NEXT:    xor a2, a2, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
@@ -2039,28 +2039,28 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa2
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa2
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
-; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa5, fa3, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa1
+; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa5, fa4, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fnmsub_h:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a0, a0, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -2177,28 +2177,28 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa2
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa2
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa3, fa5, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECKIZFHMIN-NEXT:    fmadd.s fa5, fa4, fa5, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa0, fa5
 ; CHECKIZFHMIN-NEXT:    ret
 ;
 ; CHECKIZHINXMIN-LABEL: fnmsub_h_2:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
-; CHECKIZHINXMIN-NEXT:    lui a3, 1048568
 ; CHECKIZHINXMIN-NEXT:    xor a1, a1, a3
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fmadd.s a0, a0, a1, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -2430,11 +2430,11 @@ define half @fmsub_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa2
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
+; CHECKIZFHMIN-NEXT:    fcvt.s.h fa2, fa0
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
+; CHECKIZFHMIN-NEXT:    fmul.s fa4, fa2, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa1
-; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa0
-; CHECKIZFHMIN-NEXT:    fmul.s fa4, fa3, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
@@ -2445,13 +2445,13 @@ define half @fmsub_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fmsub_h_contract:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
-; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
 ; CHECKIZHINXMIN-NEXT:    fmul.s a0, a0, a1
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a2
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fsub.s a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
@@ -2606,21 +2606,21 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
-; CHECKIZFHMIN-NEXT:    fadd.s fa3, fa3, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa3, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa2, fa2
+; CHECKIZFHMIN-NEXT:    lui a0, 1048568
+; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
+; CHECKIZFHMIN-NEXT:    fadd.s fa3, fa3, fa4
 ; CHECKIZFHMIN-NEXT:    fadd.s fa4, fa2, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa3, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fmul.s fa5, fa5, fa3
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; CHECKIZFHMIN-NEXT:    fmv.x.h a0, fa5
-; CHECKIZFHMIN-NEXT:    lui a1, 1048568
-; CHECKIZFHMIN-NEXT:    xor a0, a0, a1
+; CHECKIZFHMIN-NEXT:    fmv.x.h a1, fa5
+; CHECKIZFHMIN-NEXT:    xor a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
@@ -2631,19 +2631,19 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fnmadd_h_contract:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
+; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a2
-; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fmul.s a0, a0, a1
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    lui a1, 1048568
+; CHECKIZHINXMIN-NEXT:    fadd.s a2, a2, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a2, a2
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    xor a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a2
@@ -2781,10 +2781,10 @@ define half @fnmsub_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECKIZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
-; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa3, fa1
+; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECKIZFHMIN-NEXT:    fadd.s fa4, fa3, fa4
+; CHECKIZFHMIN-NEXT:    fcvt.h.s fa5, fa5
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
@@ -2799,10 +2799,10 @@ define half @fnmsub_h_contract(half %a, half %b, half %c) nounwind {
 ; CHECKIZHINXMIN-LABEL: fnmsub_h_contract:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
-; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
+; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, zero
 ; CHECKIZHINXMIN-NEXT:    fadd.s a1, a1, zero
+; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0

diff  --git a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll
index e0c47bfac6fec8..730bde5af610b2 100644
--- a/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll
+++ b/llvm/test/CodeGen/RISCV/half-bitmanip-dagcombines.ll
@@ -165,8 +165,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    not a1, a1
 ; RV32I-NEXT:    lui a2, 1048568
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 17
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -183,8 +183,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    not a1, a1
 ; RV64I-NEXT:    lui a2, 1048568
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 49
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -211,8 +211,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    not a1, a1
 ; RV32IZFHMIN-NEXT:    lui a2, 1048568
-; RV32IZFHMIN-NEXT:    and a1, a1, a2
 ; RV32IZFHMIN-NEXT:    slli a0, a0, 17
+; RV32IZFHMIN-NEXT:    and a1, a1, a2
 ; RV32IZFHMIN-NEXT:    srli a0, a0, 17
 ; RV32IZFHMIN-NEXT:    or a0, a0, a1
 ; RV32IZFHMIN-NEXT:    ret
@@ -221,8 +221,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    not a1, a1
 ; RV64IZFHMIN-NEXT:    lui a2, 1048568
-; RV64IZFHMIN-NEXT:    and a1, a1, a2
 ; RV64IZFHMIN-NEXT:    slli a0, a0, 49
+; RV64IZFHMIN-NEXT:    and a1, a1, a2
 ; RV64IZFHMIN-NEXT:    srli a0, a0, 49
 ; RV64IZFHMIN-NEXT:    or a0, a0, a1
 ; RV64IZFHMIN-NEXT:    ret
@@ -233,8 +233,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32IZHINXMIN-NEXT:    not a1, a1
 ; RV32IZHINXMIN-NEXT:    lui a2, 1048568
-; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
+; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -246,8 +246,8 @@ define half @fcopysign_fneg(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64IZHINXMIN-NEXT:    not a1, a1
 ; RV64IZHINXMIN-NEXT:    lui a2, 1048568
-; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
+; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10

diff  --git a/llvm/test/CodeGen/RISCV/half-br-fcmp.ll b/llvm/test/CodeGen/RISCV/half-br-fcmp.ll
index 6699ee94793796..e9b142e33362fa 100644
--- a/llvm/test/CodeGen/RISCV/half-br-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/half-br-fcmp.ll
@@ -927,9 +927,9 @@ define void @br_fcmp_ord(half %a, half %b) nounwind {
 ; RV32IZFHMIN-LABEL: br_fcmp_ord:
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
+; RV32IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
 ; RV32IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFHMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV32IZFHMIN-NEXT:    and a0, a1, a0
 ; RV32IZFHMIN-NEXT:    bnez a0, .LBB8_2
 ; RV32IZFHMIN-NEXT:  # %bb.1: # %if.else
@@ -942,9 +942,9 @@ define void @br_fcmp_ord(half %a, half %b) nounwind {
 ; RV64IZFHMIN-LABEL: br_fcmp_ord:
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
+; RV64IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
 ; RV64IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFHMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV64IZFHMIN-NEXT:    and a0, a1, a0
 ; RV64IZFHMIN-NEXT:    bnez a0, .LBB8_2
 ; RV64IZFHMIN-NEXT:  # %bb.1: # %if.else
@@ -957,8 +957,8 @@ define void @br_fcmp_ord(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-LABEL: br_fcmp_ord:
 ; RV32IZHINXMIN:       # %bb.0:
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; RV32IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; RV32IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV32IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; RV32IZHINXMIN-NEXT:    and a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    bnez a0, .LBB8_2
@@ -972,8 +972,8 @@ define void @br_fcmp_ord(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-LABEL: br_fcmp_ord:
 ; RV64IZHINXMIN:       # %bb.0:
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; RV64IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; RV64IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV64IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; RV64IZHINXMIN-NEXT:    and a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    bnez a0, .LBB8_2
@@ -1694,9 +1694,9 @@ define void @br_fcmp_uno(half %a, half %b) nounwind {
 ; RV32IZFHMIN-LABEL: br_fcmp_uno:
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
+; RV32IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
 ; RV32IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFHMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV32IZFHMIN-NEXT:    and a0, a1, a0
 ; RV32IZFHMIN-NEXT:    beqz a0, .LBB15_2
 ; RV32IZFHMIN-NEXT:  # %bb.1: # %if.else
@@ -1709,9 +1709,9 @@ define void @br_fcmp_uno(half %a, half %b) nounwind {
 ; RV64IZFHMIN-LABEL: br_fcmp_uno:
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa1
+; RV64IZFHMIN-NEXT:    fcvt.s.h fa4, fa0
 ; RV64IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFHMIN-NEXT:    feq.s a1, fa4, fa4
 ; RV64IZFHMIN-NEXT:    and a0, a1, a0
 ; RV64IZFHMIN-NEXT:    beqz a0, .LBB15_2
 ; RV64IZFHMIN-NEXT:  # %bb.1: # %if.else
@@ -1724,8 +1724,8 @@ define void @br_fcmp_uno(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-LABEL: br_fcmp_uno:
 ; RV32IZHINXMIN:       # %bb.0:
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; RV32IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV32IZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; RV32IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV32IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; RV32IZHINXMIN-NEXT:    and a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    beqz a0, .LBB15_2
@@ -1739,8 +1739,8 @@ define void @br_fcmp_uno(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-LABEL: br_fcmp_uno:
 ; RV64IZHINXMIN:       # %bb.0:
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; RV64IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV64IZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; RV64IZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; RV64IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; RV64IZHINXMIN-NEXT:    and a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    beqz a0, .LBB15_2

diff  --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll
index 0c84a08f1fd451..01ffcab1a6556f 100644
--- a/llvm/test/CodeGen/RISCV/half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/half-convert.ll
@@ -194,13 +194,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_si_h_sat:
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFH-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFH-NEXT:    neg a0, a0
-; RV32IZFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV32IZFH-NEXT:    lui a1, 815104
-; RV32IZFH-NEXT:    fmv.w.x fa3, a1
+; RV32IZFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV32IZFH-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV32IZFH-NEXT:    lui a0, 815104
+; RV32IZFH-NEXT:    fmv.w.x fa3, a0
 ; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa3
+; RV32IZFH-NEXT:    neg a0, a1
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IZFH-NEXT:    and a0, a0, a1
@@ -209,13 +209,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64IZFH-LABEL: fcvt_si_h_sat:
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFH-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFH-NEXT:    neg a0, a0
-; RV64IZFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV64IZFH-NEXT:    lui a1, 815104
-; RV64IZFH-NEXT:    fmv.w.x fa3, a1
+; RV64IZFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV64IZFH-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV64IZFH-NEXT:    lui a0, 815104
+; RV64IZFH-NEXT:    fmv.w.x fa3, a0
 ; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa3
+; RV64IZFH-NEXT:    neg a0, a1
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IZFH-NEXT:    and a0, a0, a1
@@ -224,13 +224,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32IDZFH-LABEL: fcvt_si_h_sat:
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IDZFH-NEXT:    feq.s a0, fa5, fa5
-; RV32IDZFH-NEXT:    neg a0, a0
-; RV32IDZFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV32IDZFH-NEXT:    lui a1, 815104
-; RV32IDZFH-NEXT:    fmv.w.x fa3, a1
+; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV32IDZFH-NEXT:    feq.s a1, fa5, fa5
+; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV32IDZFH-NEXT:    lui a0, 815104
+; RV32IDZFH-NEXT:    fmv.w.x fa3, a0
 ; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa3
+; RV32IDZFH-NEXT:    neg a0, a1
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IDZFH-NEXT:    and a0, a0, a1
@@ -239,13 +239,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64IDZFH-LABEL: fcvt_si_h_sat:
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IDZFH-NEXT:    feq.s a0, fa5, fa5
-; RV64IDZFH-NEXT:    neg a0, a0
-; RV64IDZFH-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV64IDZFH-NEXT:    lui a1, 815104
-; RV64IDZFH-NEXT:    fmv.w.x fa3, a1
+; RV64IDZFH-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV64IDZFH-NEXT:    feq.s a1, fa5, fa5
+; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV64IDZFH-NEXT:    lui a0, 815104
+; RV64IDZFH-NEXT:    fmv.w.x fa3, a0
 ; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa3
+; RV64IDZFH-NEXT:    neg a0, a1
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IDZFH-NEXT:    and a0, a0, a1
@@ -254,57 +254,57 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_si_h_sat:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZHINX-NEXT:    neg a1, a1
-; RV32IZHINX-NEXT:    lui a2, 815104
-; RV32IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZHINX-NEXT:    lui a1, 815104
 ; RV32IZHINX-NEXT:    lui a2, 290816
+; RV32IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZHINX-NEXT:    feq.s a0, a0, a0
 ; RV32IZHINX-NEXT:    addi a2, a2, -512
-; RV32IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZHINX-NEXT:    and a0, a1, a0
+; RV32IZHINX-NEXT:    neg a0, a0
+; RV32IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZHINX-NEXT:    and a0, a0, a1
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: fcvt_si_h_sat:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZHINX-NEXT:    neg a1, a1
-; RV64IZHINX-NEXT:    lui a2, 815104
-; RV64IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZHINX-NEXT:    lui a1, 815104
 ; RV64IZHINX-NEXT:    lui a2, 290816
+; RV64IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZHINX-NEXT:    feq.s a0, a0, a0
 ; RV64IZHINX-NEXT:    addiw a2, a2, -512
-; RV64IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZHINX-NEXT:    and a0, a1, a0
+; RV64IZHINX-NEXT:    neg a0, a0
+; RV64IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZHINX-NEXT:    and a0, a0, a1
 ; RV64IZHINX-NEXT:    ret
 ;
 ; RV32IZDINXZHINX-LABEL: fcvt_si_h_sat:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZDINXZHINX-NEXT:    neg a1, a1
-; RV32IZDINXZHINX-NEXT:    lui a2, 815104
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZDINXZHINX-NEXT:    lui a1, 815104
 ; RV32IZDINXZHINX-NEXT:    lui a2, 290816
+; RV32IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZDINXZHINX-NEXT:    feq.s a0, a0, a0
 ; RV32IZDINXZHINX-NEXT:    addi a2, a2, -512
-; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZDINXZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV32IZDINXZHINX-NEXT:    neg a0, a0
+; RV32IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZDINXZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    ret
 ;
 ; RV64IZDINXZHINX-LABEL: fcvt_si_h_sat:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZDINXZHINX-NEXT:    neg a1, a1
-; RV64IZDINXZHINX-NEXT:    lui a2, 815104
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZDINXZHINX-NEXT:    lui a1, 815104
 ; RV64IZDINXZHINX-NEXT:    lui a2, 290816
+; RV64IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZDINXZHINX-NEXT:    feq.s a0, a0, a0
 ; RV64IZDINXZHINX-NEXT:    addiw a2, a2, -512
-; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZDINXZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV64IZDINXZHINX-NEXT:    neg a0, a0
+; RV64IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZDINXZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcvt_si_h_sat:
@@ -399,13 +399,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
-; RV32ID-ILP32-NEXT:    feq.s a0, fa5, fa5
-; RV32ID-ILP32-NEXT:    neg a0, a0
-; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV32ID-ILP32-NEXT:    lui a1, 815104
-; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a1
+; RV32ID-ILP32-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV32ID-ILP32-NEXT:    feq.s a1, fa5, fa5
+; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV32ID-ILP32-NEXT:    lui a0, 815104
+; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a0
 ; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa3
+; RV32ID-ILP32-NEXT:    neg a0, a1
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-ILP32-NEXT:    and a0, a0, a1
@@ -419,13 +419,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
-; RV64ID-LP64-NEXT:    feq.s a0, fa5, fa5
-; RV64ID-LP64-NEXT:    neg a0, a0
-; RV64ID-LP64-NEXT:    lui a1, %hi(.LCPI1_0)
-; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; RV64ID-LP64-NEXT:    lui a1, 815104
-; RV64ID-LP64-NEXT:    fmv.w.x fa3, a1
+; RV64ID-LP64-NEXT:    lui a0, %hi(.LCPI1_0)
+; RV64ID-LP64-NEXT:    feq.s a1, fa5, fa5
+; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; RV64ID-LP64-NEXT:    lui a0, 815104
+; RV64ID-LP64-NEXT:    fmv.w.x fa3, a0
 ; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa3
+; RV64ID-LP64-NEXT:    neg a0, a1
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-LP64-NEXT:    and a0, a0, a1
@@ -439,12 +439,12 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    feq.s a0, fa0, fa0
-; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV32ID-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
 ; RV32ID-NEXT:    lui a1, 815104
 ; RV32ID-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-NEXT:    fmax.s fa4, fa0, fa4
+; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa4, fa5
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-NEXT:    and a0, a0, a1
@@ -458,12 +458,12 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
 ; RV64ID-NEXT:    feq.s a0, fa0, fa0
-; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV64ID-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
 ; RV64ID-NEXT:    lui a1, 815104
 ; RV64ID-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-NEXT:    fmax.s fa4, fa0, fa4
+; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa4, fa5
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-NEXT:    and a0, a0, a1
@@ -474,13 +474,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; CHECK32-IZFHMIN-LABEL: fcvt_si_h_sat:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK32-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK32-IZFHMIN-NEXT:    neg a0, a0
-; CHECK32-IZFHMIN-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; CHECK32-IZFHMIN-NEXT:    lui a1, 815104
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK32-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; CHECK32-IZFHMIN-NEXT:    lui a0, 815104
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
+; CHECK32-IZFHMIN-NEXT:    neg a0, a1
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.w.s a1, fa5, rtz
 ; CHECK32-IZFHMIN-NEXT:    and a0, a0, a1
@@ -489,13 +489,13 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; CHECK64-IZFHMIN-LABEL: fcvt_si_h_sat:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK64-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK64-IZFHMIN-NEXT:    neg a0, a0
-; CHECK64-IZFHMIN-NEXT:    lui a1, %hi(.LCPI1_0)
-; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a1)
-; CHECK64-IZFHMIN-NEXT:    lui a1, 815104
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK64-IZFHMIN-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK64-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI1_0)(a0)
+; CHECK64-IZFHMIN-NEXT:    lui a0, 815104
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
+; CHECK64-IZFHMIN-NEXT:    neg a0, a1
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.l.s a1, fa5, rtz
 ; CHECK64-IZFHMIN-NEXT:    and a0, a0, a1
@@ -504,57 +504,57 @@ define i16 @fcvt_si_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_si_h_sat:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZHINXMIN-NEXT:    lui a2, 815104
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK32-IZHINXMIN-NEXT:    lui a2, 290816
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK32-IZHINXMIN-NEXT:    addi a2, a2, -512
-; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZHINXMIN-LABEL: fcvt_si_h_sat:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZHINXMIN-NEXT:    lui a2, 815104
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK64-IZHINXMIN-NEXT:    lui a2, 290816
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK64-IZHINXMIN-NEXT:    addiw a2, a2, -512
-; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_si_h_sat:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 815104
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 290816
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a2, a2, -512
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_si_h_sat:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 815104
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 290816
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK64-IZDINXZHINXMIN-NEXT:    addiw a2, a2, -512
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    ret
 start:
   %0 = tail call i16 @llvm.fptosi.sat.i16.f16(half %a)
@@ -756,8 +756,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_ui_h_sat:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZHINX-NEXT:    lui a1, 292864
+; RV32IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZHINX-NEXT:    addi a1, a1, -256
 ; RV32IZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV32IZHINX-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -766,8 +766,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV64IZHINX-LABEL: fcvt_ui_h_sat:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZHINX-NEXT:    lui a1, 292864
+; RV64IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZHINX-NEXT:    addiw a1, a1, -256
 ; RV64IZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV64IZHINX-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -776,8 +776,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV32IZDINXZHINX-LABEL: fcvt_ui_h_sat:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZDINXZHINX-NEXT:    lui a1, 292864
+; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZDINXZHINX-NEXT:    addi a1, a1, -256
 ; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -786,8 +786,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; RV64IZDINXZHINX-LABEL: fcvt_ui_h_sat:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZDINXZHINX-NEXT:    lui a1, 292864
+; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZDINXZHINX-NEXT:    addiw a1, a1, -256
 ; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -956,8 +956,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZHINXMIN-NEXT:    lui a1, 292864
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZHINXMIN-NEXT:    addi a1, a1, -256
 ; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -966,8 +966,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; CHECK64-IZHINXMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZHINXMIN-NEXT:    lui a1, 292864
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZHINXMIN-NEXT:    addiw a1, a1, -256
 ; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -976,8 +976,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 292864
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a1, a1, -256
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -986,8 +986,8 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind {
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_ui_h_sat:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 292864
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZDINXZHINXMIN-NEXT:    addiw a1, a1, -256
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -2153,7 +2153,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB10_2
 ; RV32IZFH-NEXT:  # %bb.1: # %start
@@ -2161,19 +2161,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZFH-NEXT:  .LBB10_2: # %start
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB10_4
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB10_4
 ; RV32IZFH-NEXT:  # %bb.3:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB10_4: # %start
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
 ; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    neg a3, s0
+; RV32IZFH-NEXT:    neg a1, s0
+; RV32IZFH-NEXT:    neg a3, a3
+; RV32IZFH-NEXT:    and a0, a1, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a4, a0
 ; RV32IZFH-NEXT:    and a0, a3, a0
-; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a0, a4, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -2201,7 +2201,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IDZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IDZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IDZFH-NEXT:    call __fixsfdi
-; RV32IDZFH-NEXT:    lui a4, 524288
+; RV32IDZFH-NEXT:    lui a3, 524288
 ; RV32IDZFH-NEXT:    lui a2, 524288
 ; RV32IDZFH-NEXT:    beqz s0, .LBB10_2
 ; RV32IDZFH-NEXT:  # %bb.1: # %start
@@ -2209,19 +2209,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IDZFH-NEXT:  .LBB10_2: # %start
 ; RV32IDZFH-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IDZFH-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IDZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IDZFH-NEXT:    beqz a3, .LBB10_4
+; RV32IDZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IDZFH-NEXT:    beqz a1, .LBB10_4
 ; RV32IDZFH-NEXT:  # %bb.3:
-; RV32IDZFH-NEXT:    addi a2, a4, -1
+; RV32IDZFH-NEXT:    addi a2, a3, -1
 ; RV32IDZFH-NEXT:  .LBB10_4: # %start
-; RV32IDZFH-NEXT:    feq.s a1, fs0, fs0
+; RV32IDZFH-NEXT:    feq.s a3, fs0, fs0
 ; RV32IDZFH-NEXT:    neg a4, a1
-; RV32IDZFH-NEXT:    and a1, a4, a2
-; RV32IDZFH-NEXT:    neg a2, a3
-; RV32IDZFH-NEXT:    neg a3, s0
+; RV32IDZFH-NEXT:    neg a1, s0
+; RV32IDZFH-NEXT:    neg a3, a3
+; RV32IDZFH-NEXT:    and a0, a1, a0
+; RV32IDZFH-NEXT:    and a1, a3, a2
+; RV32IDZFH-NEXT:    or a0, a4, a0
 ; RV32IDZFH-NEXT:    and a0, a3, a0
-; RV32IDZFH-NEXT:    or a0, a2, a0
-; RV32IDZFH-NEXT:    and a0, a4, a0
 ; RV32IDZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IDZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IDZFH-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -2248,7 +2248,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB10_2
 ; RV32IZHINX-NEXT:  # %bb.1: # %start
@@ -2256,19 +2256,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZHINX-NEXT:  .LBB10_2: # %start
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB10_4
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB10_4
 ; RV32IZHINX-NEXT:  # %bb.3:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB10_4: # %start
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
 ; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    neg a3, s1
+; RV32IZHINX-NEXT:    neg a1, s1
+; RV32IZHINX-NEXT:    neg a3, a3
+; RV32IZHINX-NEXT:    and a0, a1, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a4, a0
 ; RV32IZHINX-NEXT:    and a0, a3, a0
-; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a0, a4, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2295,7 +2295,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZDINXZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZDINXZHINX-NEXT:    mv a0, s0
 ; RV32IZDINXZHINX-NEXT:    call __fixsfdi
-; RV32IZDINXZHINX-NEXT:    lui a4, 524288
+; RV32IZDINXZHINX-NEXT:    lui a3, 524288
 ; RV32IZDINXZHINX-NEXT:    lui a2, 524288
 ; RV32IZDINXZHINX-NEXT:    beqz s1, .LBB10_2
 ; RV32IZDINXZHINX-NEXT:  # %bb.1: # %start
@@ -2303,19 +2303,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IZDINXZHINX-NEXT:  .LBB10_2: # %start
 ; RV32IZDINXZHINX-NEXT:    lui a1, 389120
 ; RV32IZDINXZHINX-NEXT:    addi a1, a1, -1
-; RV32IZDINXZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZDINXZHINX-NEXT:    beqz a3, .LBB10_4
+; RV32IZDINXZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZDINXZHINX-NEXT:    beqz a1, .LBB10_4
 ; RV32IZDINXZHINX-NEXT:  # %bb.3:
-; RV32IZDINXZHINX-NEXT:    addi a2, a4, -1
+; RV32IZDINXZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZDINXZHINX-NEXT:  .LBB10_4: # %start
-; RV32IZDINXZHINX-NEXT:    feq.s a1, s0, s0
+; RV32IZDINXZHINX-NEXT:    feq.s a3, s0, s0
 ; RV32IZDINXZHINX-NEXT:    neg a4, a1
-; RV32IZDINXZHINX-NEXT:    and a1, a4, a2
-; RV32IZDINXZHINX-NEXT:    neg a2, a3
-; RV32IZDINXZHINX-NEXT:    neg a3, s1
+; RV32IZDINXZHINX-NEXT:    neg a1, s1
+; RV32IZDINXZHINX-NEXT:    neg a3, a3
+; RV32IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV32IZDINXZHINX-NEXT:    and a1, a3, a2
+; RV32IZDINXZHINX-NEXT:    or a0, a4, a0
 ; RV32IZDINXZHINX-NEXT:    and a0, a3, a0
-; RV32IZDINXZHINX-NEXT:    or a0, a2, a0
-; RV32IZDINXZHINX-NEXT:    and a0, a4, a0
 ; RV32IZDINXZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZDINXZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZDINXZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2342,13 +2342,13 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s5, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    call __extendhfsf2
-; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv s2, a0
 ; RV32I-NEXT:    lui a1, 913408
 ; RV32I-NEXT:    call __gesf2
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __fixsfdi
-; RV32I-NEXT:    mv s2, a0
+; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    mv s3, a1
 ; RV32I-NEXT:    lui s5, 524288
 ; RV32I-NEXT:    bgez s0, .LBB10_2
@@ -2357,25 +2357,25 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32I-NEXT:  .LBB10_2: # %start
 ; RV32I-NEXT:    lui a1, 389120
 ; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a0, s2
 ; RV32I-NEXT:    call __gtsf2
 ; RV32I-NEXT:    mv s4, a0
 ; RV32I-NEXT:    blez a0, .LBB10_4
 ; RV32I-NEXT:  # %bb.3: # %start
 ; RV32I-NEXT:    addi s3, s5, -1
 ; RV32I-NEXT:  .LBB10_4: # %start
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a1, s2
 ; RV32I-NEXT:    call __unordsf2
 ; RV32I-NEXT:    snez a0, a0
+; RV32I-NEXT:    sgtz a1, s4
+; RV32I-NEXT:    slti a2, s0, 0
 ; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    neg a3, a1
+; RV32I-NEXT:    addi a2, a2, -1
 ; RV32I-NEXT:    and a1, a0, s3
-; RV32I-NEXT:    sgtz a2, s4
-; RV32I-NEXT:    neg a2, a2
-; RV32I-NEXT:    slti a3, s0, 0
-; RV32I-NEXT:    addi a3, a3, -1
-; RV32I-NEXT:    and a3, a3, s2
-; RV32I-NEXT:    or a2, a2, a3
+; RV32I-NEXT:    and a2, a2, s1
+; RV32I-NEXT:    or a2, a3, a2
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -2444,7 +2444,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    fsw fa4, 4(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    fle.s s0, fa5, fa4
 ; RV32ID-ILP32-NEXT:    call __fixsfdi
-; RV32ID-ILP32-NEXT:    lui a4, 524288
+; RV32ID-ILP32-NEXT:    lui a3, 524288
 ; RV32ID-ILP32-NEXT:    lui a2, 524288
 ; RV32ID-ILP32-NEXT:    beqz s0, .LBB10_2
 ; RV32ID-ILP32-NEXT:  # %bb.1: # %start
@@ -2453,20 +2453,20 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32ID-ILP32-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
 ; RV32ID-ILP32-NEXT:    flw fa4, 4(sp) # 4-byte Folded Reload
-; RV32ID-ILP32-NEXT:    flt.s a3, fa5, fa4
+; RV32ID-ILP32-NEXT:    flt.s a1, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fmv.s fa5, fa4
-; RV32ID-ILP32-NEXT:    beqz a3, .LBB10_4
+; RV32ID-ILP32-NEXT:    beqz a1, .LBB10_4
 ; RV32ID-ILP32-NEXT:  # %bb.3:
-; RV32ID-ILP32-NEXT:    addi a2, a4, -1
+; RV32ID-ILP32-NEXT:    addi a2, a3, -1
 ; RV32ID-ILP32-NEXT:  .LBB10_4: # %start
-; RV32ID-ILP32-NEXT:    feq.s a1, fa5, fa5
+; RV32ID-ILP32-NEXT:    feq.s a3, fa5, fa5
 ; RV32ID-ILP32-NEXT:    neg a4, a1
-; RV32ID-ILP32-NEXT:    and a1, a4, a2
-; RV32ID-ILP32-NEXT:    neg a2, a3
-; RV32ID-ILP32-NEXT:    neg a3, s0
+; RV32ID-ILP32-NEXT:    neg a1, s0
+; RV32ID-ILP32-NEXT:    neg a3, a3
+; RV32ID-ILP32-NEXT:    and a0, a1, a0
+; RV32ID-ILP32-NEXT:    and a1, a3, a2
+; RV32ID-ILP32-NEXT:    or a0, a4, a0
 ; RV32ID-ILP32-NEXT:    and a0, a3, a0
-; RV32ID-ILP32-NEXT:    or a0, a2, a0
-; RV32ID-ILP32-NEXT:    and a0, a4, a0
 ; RV32ID-ILP32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-ILP32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32ID-ILP32-NEXT:    addi sp, sp, 16
@@ -2499,7 +2499,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:    fmv.w.x fa5, a0
 ; RV32ID-NEXT:    fle.s s0, fa5, fa0
 ; RV32ID-NEXT:    call __fixsfdi
-; RV32ID-NEXT:    lui a4, 524288
+; RV32ID-NEXT:    lui a3, 524288
 ; RV32ID-NEXT:    lui a2, 524288
 ; RV32ID-NEXT:    beqz s0, .LBB10_2
 ; RV32ID-NEXT:  # %bb.1: # %start
@@ -2507,19 +2507,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:  .LBB10_2: # %start
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32ID-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32ID-NEXT:    flt.s a3, fa5, fs0
-; RV32ID-NEXT:    beqz a3, .LBB10_4
+; RV32ID-NEXT:    flt.s a1, fa5, fs0
+; RV32ID-NEXT:    beqz a1, .LBB10_4
 ; RV32ID-NEXT:  # %bb.3:
-; RV32ID-NEXT:    addi a2, a4, -1
+; RV32ID-NEXT:    addi a2, a3, -1
 ; RV32ID-NEXT:  .LBB10_4: # %start
-; RV32ID-NEXT:    feq.s a1, fs0, fs0
-; RV32ID-NEXT:    neg a4, a1
-; RV32ID-NEXT:    and a1, a4, a2
-; RV32ID-NEXT:    neg a2, s0
-; RV32ID-NEXT:    and a0, a2, a0
-; RV32ID-NEXT:    neg a2, a3
-; RV32ID-NEXT:    or a0, a2, a0
+; RV32ID-NEXT:    feq.s a3, fs0, fs0
+; RV32ID-NEXT:    neg a4, s0
+; RV32ID-NEXT:    neg a5, a1
+; RV32ID-NEXT:    neg a3, a3
 ; RV32ID-NEXT:    and a0, a4, a0
+; RV32ID-NEXT:    and a1, a3, a2
+; RV32ID-NEXT:    or a0, a5, a0
+; RV32ID-NEXT:    and a0, a3, a0
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -2552,7 +2552,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IFZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IFZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IFZFHMIN-NEXT:    call __fixsfdi
-; RV32IFZFHMIN-NEXT:    lui a4, 524288
+; RV32IFZFHMIN-NEXT:    lui a3, 524288
 ; RV32IFZFHMIN-NEXT:    lui a2, 524288
 ; RV32IFZFHMIN-NEXT:    beqz s0, .LBB10_2
 ; RV32IFZFHMIN-NEXT:  # %bb.1: # %start
@@ -2560,19 +2560,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IFZFHMIN-NEXT:  .LBB10_2: # %start
 ; RV32IFZFHMIN-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IFZFHMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IFZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IFZFHMIN-NEXT:    beqz a3, .LBB10_4
+; RV32IFZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IFZFHMIN-NEXT:    beqz a1, .LBB10_4
 ; RV32IFZFHMIN-NEXT:  # %bb.3:
-; RV32IFZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IFZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IFZFHMIN-NEXT:  .LBB10_4: # %start
-; RV32IFZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IFZFHMIN-NEXT:    feq.s a3, fs0, fs0
 ; RV32IFZFHMIN-NEXT:    neg a4, a1
-; RV32IFZFHMIN-NEXT:    and a1, a4, a2
-; RV32IFZFHMIN-NEXT:    neg a2, a3
-; RV32IFZFHMIN-NEXT:    neg a3, s0
+; RV32IFZFHMIN-NEXT:    neg a1, s0
+; RV32IFZFHMIN-NEXT:    neg a3, a3
+; RV32IFZFHMIN-NEXT:    and a0, a1, a0
+; RV32IFZFHMIN-NEXT:    and a1, a3, a2
+; RV32IFZFHMIN-NEXT:    or a0, a4, a0
 ; RV32IFZFHMIN-NEXT:    and a0, a3, a0
-; RV32IFZFHMIN-NEXT:    or a0, a2, a0
-; RV32IFZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IFZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IFZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IFZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -2601,7 +2601,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IDZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IDZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IDZFHMIN-NEXT:    call __fixsfdi
-; RV32IDZFHMIN-NEXT:    lui a4, 524288
+; RV32IDZFHMIN-NEXT:    lui a3, 524288
 ; RV32IDZFHMIN-NEXT:    lui a2, 524288
 ; RV32IDZFHMIN-NEXT:    beqz s0, .LBB10_2
 ; RV32IDZFHMIN-NEXT:  # %bb.1: # %start
@@ -2609,19 +2609,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; RV32IDZFHMIN-NEXT:  .LBB10_2: # %start
 ; RV32IDZFHMIN-NEXT:    lui a1, %hi(.LCPI10_0)
 ; RV32IDZFHMIN-NEXT:    flw fa5, %lo(.LCPI10_0)(a1)
-; RV32IDZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IDZFHMIN-NEXT:    beqz a3, .LBB10_4
+; RV32IDZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IDZFHMIN-NEXT:    beqz a1, .LBB10_4
 ; RV32IDZFHMIN-NEXT:  # %bb.3:
-; RV32IDZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IDZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IDZFHMIN-NEXT:  .LBB10_4: # %start
-; RV32IDZFHMIN-NEXT:    feq.s a1, fs0, fs0
+; RV32IDZFHMIN-NEXT:    feq.s a3, fs0, fs0
 ; RV32IDZFHMIN-NEXT:    neg a4, a1
-; RV32IDZFHMIN-NEXT:    and a1, a4, a2
-; RV32IDZFHMIN-NEXT:    neg a2, a3
-; RV32IDZFHMIN-NEXT:    neg a3, s0
+; RV32IDZFHMIN-NEXT:    neg a1, s0
+; RV32IDZFHMIN-NEXT:    neg a3, a3
+; RV32IDZFHMIN-NEXT:    and a0, a1, a0
+; RV32IDZFHMIN-NEXT:    and a1, a3, a2
+; RV32IDZFHMIN-NEXT:    or a0, a4, a0
 ; RV32IDZFHMIN-NEXT:    and a0, a3, a0
-; RV32IDZFHMIN-NEXT:    or a0, a2, a0
-; RV32IDZFHMIN-NEXT:    and a0, a4, a0
 ; RV32IDZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IDZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IDZFHMIN-NEXT:    fld fs0, 0(sp) # 8-byte Folded Reload
@@ -2639,7 +2639,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; CHECK32-IZHINXMIN-NEXT:    mv a0, s0
 ; CHECK32-IZHINXMIN-NEXT:    call __fixsfdi
-; CHECK32-IZHINXMIN-NEXT:    lui a4, 524288
+; CHECK32-IZHINXMIN-NEXT:    lui a3, 524288
 ; CHECK32-IZHINXMIN-NEXT:    lui a2, 524288
 ; CHECK32-IZHINXMIN-NEXT:    beqz s1, .LBB10_2
 ; CHECK32-IZHINXMIN-NEXT:  # %bb.1: # %start
@@ -2647,19 +2647,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-NEXT:  .LBB10_2: # %start
 ; CHECK32-IZHINXMIN-NEXT:    lui a1, 389120
 ; CHECK32-IZHINXMIN-NEXT:    addi a1, a1, -1
-; CHECK32-IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; CHECK32-IZHINXMIN-NEXT:    beqz a3, .LBB10_4
+; CHECK32-IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; CHECK32-IZHINXMIN-NEXT:    beqz a1, .LBB10_4
 ; CHECK32-IZHINXMIN-NEXT:  # %bb.3:
-; CHECK32-IZHINXMIN-NEXT:    addi a2, a4, -1
+; CHECK32-IZHINXMIN-NEXT:    addi a2, a3, -1
 ; CHECK32-IZHINXMIN-NEXT:  .LBB10_4: # %start
-; CHECK32-IZHINXMIN-NEXT:    feq.s a1, s0, s0
+; CHECK32-IZHINXMIN-NEXT:    feq.s a3, s0, s0
 ; CHECK32-IZHINXMIN-NEXT:    neg a4, a1
-; CHECK32-IZHINXMIN-NEXT:    and a1, a4, a2
-; CHECK32-IZHINXMIN-NEXT:    neg a2, a3
-; CHECK32-IZHINXMIN-NEXT:    neg a3, s1
+; CHECK32-IZHINXMIN-NEXT:    neg a1, s1
+; CHECK32-IZHINXMIN-NEXT:    neg a3, a3
+; CHECK32-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZHINXMIN-NEXT:    and a1, a3, a2
+; CHECK32-IZHINXMIN-NEXT:    or a0, a4, a0
 ; CHECK32-IZHINXMIN-NEXT:    and a0, a3, a0
-; CHECK32-IZHINXMIN-NEXT:    or a0, a2, a0
-; CHECK32-IZHINXMIN-NEXT:    and a0, a4, a0
 ; CHECK32-IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; CHECK32-IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2687,7 +2687,7 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    mv a0, s0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    call __fixsfdi
-; CHECK32-IZDINXZHINXMIN-NEXT:    lui a4, 524288
+; CHECK32-IZDINXZHINXMIN-NEXT:    lui a3, 524288
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 524288
 ; CHECK32-IZDINXZHINXMIN-NEXT:    beqz s1, .LBB10_2
 ; CHECK32-IZDINXZHINXMIN-NEXT:  # %bb.1: # %start
@@ -2695,19 +2695,19 @@ define i64 @fcvt_l_h_sat(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-NEXT:  .LBB10_2: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 389120
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a1, a1, -1
-; CHECK32-IZDINXZHINXMIN-NEXT:    flt.s a3, a1, s0
-; CHECK32-IZDINXZHINXMIN-NEXT:    beqz a3, .LBB10_4
+; CHECK32-IZDINXZHINXMIN-NEXT:    flt.s a1, a1, s0
+; CHECK32-IZDINXZHINXMIN-NEXT:    beqz a1, .LBB10_4
 ; CHECK32-IZDINXZHINXMIN-NEXT:  # %bb.3:
-; CHECK32-IZDINXZHINXMIN-NEXT:    addi a2, a4, -1
+; CHECK32-IZDINXZHINXMIN-NEXT:    addi a2, a3, -1
 ; CHECK32-IZDINXZHINXMIN-NEXT:  .LBB10_4: # %start
-; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a1, s0, s0
+; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a3, s0, s0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    neg a4, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a1, a4, a2
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a2, a3
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a3, s1
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a1, s1
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a3, a3
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a1, a3, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    or a0, a4, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a3, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    or a0, a2, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a4, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2907,15 +2907,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IZFH-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; RV32IZFH-NEXT:    fcvt.s.h fa0, fa0
-; RV32IZFH-NEXT:    flt.s a0, fa5, fa0
-; RV32IZFH-NEXT:    neg s0, a0
-; RV32IZFH-NEXT:    fmv.w.x fa5, zero
-; RV32IZFH-NEXT:    fle.s a0, fa5, fa0
+; RV32IZFH-NEXT:    fmv.w.x fa4, zero
+; RV32IZFH-NEXT:    fle.s a0, fa4, fa0
+; RV32IZFH-NEXT:    flt.s a1, fa5, fa0
+; RV32IZFH-NEXT:    neg s0, a1
 ; RV32IZFH-NEXT:    neg s1, a0
 ; RV32IZFH-NEXT:    call __fixunssfdi
 ; RV32IZFH-NEXT:    and a0, s1, a0
-; RV32IZFH-NEXT:    or a0, s0, a0
 ; RV32IZFH-NEXT:    and a1, s1, a1
+; RV32IZFH-NEXT:    or a0, s0, a0
 ; RV32IZFH-NEXT:    or a1, s0, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2941,15 +2941,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32IDZFH-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; RV32IDZFH-NEXT:    fcvt.s.h fa0, fa0
-; RV32IDZFH-NEXT:    flt.s a0, fa5, fa0
-; RV32IDZFH-NEXT:    neg s0, a0
-; RV32IDZFH-NEXT:    fmv.w.x fa5, zero
-; RV32IDZFH-NEXT:    fle.s a0, fa5, fa0
+; RV32IDZFH-NEXT:    fmv.w.x fa4, zero
+; RV32IDZFH-NEXT:    fle.s a0, fa4, fa0
+; RV32IDZFH-NEXT:    flt.s a1, fa5, fa0
+; RV32IDZFH-NEXT:    neg s0, a1
 ; RV32IDZFH-NEXT:    neg s1, a0
 ; RV32IDZFH-NEXT:    call __fixunssfdi
 ; RV32IDZFH-NEXT:    and a0, s1, a0
-; RV32IDZFH-NEXT:    or a0, s0, a0
 ; RV32IDZFH-NEXT:    and a1, s1, a1
+; RV32IDZFH-NEXT:    or a0, s0, a0
 ; RV32IDZFH-NEXT:    or a1, s0, a1
 ; RV32IDZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IDZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2975,14 +2975,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
 ; RV32IZHINX-NEXT:    lui a1, 391168
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
+; RV32IZHINX-NEXT:    fle.s a2, zero, a0
 ; RV32IZHINX-NEXT:    flt.s a1, a1, a0
 ; RV32IZHINX-NEXT:    neg s0, a1
-; RV32IZHINX-NEXT:    fle.s a1, zero, a0
-; RV32IZHINX-NEXT:    neg s1, a1
+; RV32IZHINX-NEXT:    neg s1, a2
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
-; RV32IZHINX-NEXT:    or a0, s0, a0
 ; RV32IZHINX-NEXT:    and a1, s1, a1
+; RV32IZHINX-NEXT:    or a0, s0, a0
 ; RV32IZHINX-NEXT:    or a1, s0, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3008,14 +3008,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
 ; RV32IZDINXZHINX-NEXT:    lui a1, 391168
 ; RV32IZDINXZHINX-NEXT:    addi a1, a1, -1
+; RV32IZDINXZHINX-NEXT:    fle.s a2, zero, a0
 ; RV32IZDINXZHINX-NEXT:    flt.s a1, a1, a0
 ; RV32IZDINXZHINX-NEXT:    neg s0, a1
-; RV32IZDINXZHINX-NEXT:    fle.s a1, zero, a0
-; RV32IZDINXZHINX-NEXT:    neg s1, a1
+; RV32IZDINXZHINX-NEXT:    neg s1, a2
 ; RV32IZDINXZHINX-NEXT:    call __fixunssfdi
 ; RV32IZDINXZHINX-NEXT:    and a0, s1, a0
-; RV32IZDINXZHINX-NEXT:    or a0, s0, a0
 ; RV32IZDINXZHINX-NEXT:    and a1, s1, a1
+; RV32IZDINXZHINX-NEXT:    or a0, s0, a0
 ; RV32IZDINXZHINX-NEXT:    or a1, s0, a1
 ; RV32IZDINXZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZDINXZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3054,8 +3054,8 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    call __fixunssfdi
 ; RV32I-NEXT:    and a0, s2, a0
-; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    and a1, s2, a1
+; RV32I-NEXT:    or a0, s1, a0
 ; RV32I-NEXT:    or a1, s1, a1
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3106,15 +3106,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI12_0)
 ; RV32ID-ILP32-NEXT:    flw fa5, %lo(.LCPI12_0)(a1)
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
-; RV32ID-ILP32-NEXT:    flt.s a1, fa5, fa4
-; RV32ID-ILP32-NEXT:    neg s0, a1
-; RV32ID-ILP32-NEXT:    fmv.w.x fa5, zero
-; RV32ID-ILP32-NEXT:    fle.s a1, fa5, fa4
+; RV32ID-ILP32-NEXT:    fmv.w.x fa3, zero
+; RV32ID-ILP32-NEXT:    fle.s a1, fa3, fa4
+; RV32ID-ILP32-NEXT:    flt.s a2, fa5, fa4
+; RV32ID-ILP32-NEXT:    neg s0, a2
 ; RV32ID-ILP32-NEXT:    neg s1, a1
 ; RV32ID-ILP32-NEXT:    call __fixunssfdi
 ; RV32ID-ILP32-NEXT:    and a0, s1, a0
-; RV32ID-ILP32-NEXT:    or a0, s0, a0
 ; RV32ID-ILP32-NEXT:    and a1, s1, a1
+; RV32ID-ILP32-NEXT:    or a0, s0, a0
 ; RV32ID-ILP32-NEXT:    or a1, s0, a1
 ; RV32ID-ILP32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-ILP32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3146,15 +3146,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32ID-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
-; RV32ID-NEXT:    flt.s a0, fa5, fa0
-; RV32ID-NEXT:    neg s0, a0
-; RV32ID-NEXT:    fmv.w.x fa5, zero
-; RV32ID-NEXT:    fle.s a0, fa5, fa0
+; RV32ID-NEXT:    fmv.w.x fa4, zero
+; RV32ID-NEXT:    fle.s a0, fa4, fa0
+; RV32ID-NEXT:    flt.s a1, fa5, fa0
+; RV32ID-NEXT:    neg s0, a1
 ; RV32ID-NEXT:    neg s1, a0
 ; RV32ID-NEXT:    call __fixunssfdi
 ; RV32ID-NEXT:    and a0, s1, a0
-; RV32ID-NEXT:    or a0, s0, a0
 ; RV32ID-NEXT:    and a1, s1, a1
+; RV32ID-NEXT:    or a0, s0, a0
 ; RV32ID-NEXT:    or a1, s0, a1
 ; RV32ID-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32ID-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3185,15 +3185,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI12_0)
 ; CHECK32-IZFHMIN-NEXT:    flw fa5, %lo(.LCPI12_0)(a0)
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa0, fa0
-; CHECK32-IZFHMIN-NEXT:    flt.s a0, fa5, fa0
-; CHECK32-IZFHMIN-NEXT:    neg s0, a0
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa5, zero
-; CHECK32-IZFHMIN-NEXT:    fle.s a0, fa5, fa0
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, zero
+; CHECK32-IZFHMIN-NEXT:    fle.s a0, fa4, fa0
+; CHECK32-IZFHMIN-NEXT:    flt.s a1, fa5, fa0
+; CHECK32-IZFHMIN-NEXT:    neg s0, a1
 ; CHECK32-IZFHMIN-NEXT:    neg s1, a0
 ; CHECK32-IZFHMIN-NEXT:    call __fixunssfdi
 ; CHECK32-IZFHMIN-NEXT:    and a0, s1, a0
-; CHECK32-IZFHMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZFHMIN-NEXT:    and a1, s1, a1
+; CHECK32-IZFHMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZFHMIN-NEXT:    or a1, s0, a1
 ; CHECK32-IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3220,14 +3220,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECK32-IZHINXMIN-NEXT:    lui a1, 391168
 ; CHECK32-IZHINXMIN-NEXT:    addi a1, a1, -1
+; CHECK32-IZHINXMIN-NEXT:    fle.s a2, zero, a0
 ; CHECK32-IZHINXMIN-NEXT:    flt.s a1, a1, a0
 ; CHECK32-IZHINXMIN-NEXT:    neg s0, a1
-; CHECK32-IZHINXMIN-NEXT:    fle.s a1, zero, a0
-; CHECK32-IZHINXMIN-NEXT:    neg s1, a1
+; CHECK32-IZHINXMIN-NEXT:    neg s1, a2
 ; CHECK32-IZHINXMIN-NEXT:    call __fixunssfdi
 ; CHECK32-IZHINXMIN-NEXT:    and a0, s1, a0
-; CHECK32-IZHINXMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZHINXMIN-NEXT:    and a1, s1, a1
+; CHECK32-IZHINXMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZHINXMIN-NEXT:    or a1, s0, a1
 ; CHECK32-IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3254,14 +3254,14 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 391168
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a1, a1, -1
+; CHECK32-IZDINXZHINXMIN-NEXT:    fle.s a2, zero, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    flt.s a1, a1, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    neg s0, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    fle.s a1, zero, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg s1, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg s1, a2
 ; CHECK32-IZDINXZHINXMIN-NEXT:    call __fixunssfdi
 ; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, s1, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    and a1, s1, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    or a0, s0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    or a1, s0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -6296,13 +6296,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IZFH-NEXT:    feq.s a0, fa5, fa5
-; RV32IZFH-NEXT:    neg a0, a0
-; RV32IZFH-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV32IZFH-NEXT:    lui a1, 815104
-; RV32IZFH-NEXT:    fmv.w.x fa3, a1
+; RV32IZFH-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV32IZFH-NEXT:    feq.s a1, fa5, fa5
+; RV32IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV32IZFH-NEXT:    lui a0, 815104
+; RV32IZFH-NEXT:    fmv.w.x fa3, a0
 ; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa3
+; RV32IZFH-NEXT:    neg a0, a1
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IZFH-NEXT:    and a0, a0, a1
@@ -6311,13 +6311,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64IZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IZFH-NEXT:    feq.s a0, fa5, fa5
-; RV64IZFH-NEXT:    neg a0, a0
-; RV64IZFH-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV64IZFH-NEXT:    lui a1, 815104
-; RV64IZFH-NEXT:    fmv.w.x fa3, a1
+; RV64IZFH-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV64IZFH-NEXT:    feq.s a1, fa5, fa5
+; RV64IZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV64IZFH-NEXT:    lui a0, 815104
+; RV64IZFH-NEXT:    fmv.w.x fa3, a0
 ; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa3
+; RV64IZFH-NEXT:    neg a0, a1
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IZFH-NEXT:    and a0, a0, a1
@@ -6326,13 +6326,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32IDZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV32IDZFH-NEXT:    feq.s a0, fa5, fa5
-; RV32IDZFH-NEXT:    neg a0, a0
-; RV32IDZFH-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV32IDZFH-NEXT:    lui a1, 815104
-; RV32IDZFH-NEXT:    fmv.w.x fa3, a1
+; RV32IDZFH-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV32IDZFH-NEXT:    feq.s a1, fa5, fa5
+; RV32IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV32IDZFH-NEXT:    lui a0, 815104
+; RV32IDZFH-NEXT:    fmv.w.x fa3, a0
 ; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa3
+; RV32IDZFH-NEXT:    neg a0, a1
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32IDZFH-NEXT:    and a0, a0, a1
@@ -6341,13 +6341,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64IDZFH-LABEL: fcvt_w_s_sat_i16:
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
-; RV64IDZFH-NEXT:    feq.s a0, fa5, fa5
-; RV64IDZFH-NEXT:    neg a0, a0
-; RV64IDZFH-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV64IDZFH-NEXT:    lui a1, 815104
-; RV64IDZFH-NEXT:    fmv.w.x fa3, a1
+; RV64IDZFH-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV64IDZFH-NEXT:    feq.s a1, fa5, fa5
+; RV64IDZFH-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV64IDZFH-NEXT:    lui a0, 815104
+; RV64IDZFH-NEXT:    fmv.w.x fa3, a0
 ; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa3
+; RV64IDZFH-NEXT:    neg a0, a1
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64IDZFH-NEXT:    and a0, a0, a1
@@ -6356,57 +6356,57 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZHINX-NEXT:    neg a1, a1
-; RV32IZHINX-NEXT:    lui a2, 815104
-; RV32IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZHINX-NEXT:    lui a1, 815104
 ; RV32IZHINX-NEXT:    lui a2, 290816
+; RV32IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZHINX-NEXT:    feq.s a0, a0, a0
 ; RV32IZHINX-NEXT:    addi a2, a2, -512
-; RV32IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZHINX-NEXT:    and a0, a1, a0
+; RV32IZHINX-NEXT:    neg a0, a0
+; RV32IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZHINX-NEXT:    and a0, a0, a1
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZHINX-NEXT:    neg a1, a1
-; RV64IZHINX-NEXT:    lui a2, 815104
-; RV64IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZHINX-NEXT:    lui a1, 815104
 ; RV64IZHINX-NEXT:    lui a2, 290816
+; RV64IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZHINX-NEXT:    feq.s a0, a0, a0
 ; RV64IZHINX-NEXT:    addiw a2, a2, -512
-; RV64IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZHINX-NEXT:    and a0, a1, a0
+; RV64IZHINX-NEXT:    neg a0, a0
+; RV64IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZHINX-NEXT:    and a0, a0, a1
 ; RV64IZHINX-NEXT:    ret
 ;
 ; RV32IZDINXZHINX-LABEL: fcvt_w_s_sat_i16:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZDINXZHINX-NEXT:    neg a1, a1
-; RV32IZDINXZHINX-NEXT:    lui a2, 815104
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZDINXZHINX-NEXT:    lui a1, 815104
 ; RV32IZDINXZHINX-NEXT:    lui a2, 290816
+; RV32IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZDINXZHINX-NEXT:    feq.s a0, a0, a0
 ; RV32IZDINXZHINX-NEXT:    addi a2, a2, -512
-; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZDINXZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV32IZDINXZHINX-NEXT:    neg a0, a0
+; RV32IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZDINXZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    ret
 ;
 ; RV64IZDINXZHINX-LABEL: fcvt_w_s_sat_i16:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZDINXZHINX-NEXT:    neg a1, a1
-; RV64IZDINXZHINX-NEXT:    lui a2, 815104
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZDINXZHINX-NEXT:    lui a1, 815104
 ; RV64IZDINXZHINX-NEXT:    lui a2, 290816
+; RV64IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZDINXZHINX-NEXT:    feq.s a0, a0, a0
 ; RV64IZDINXZHINX-NEXT:    addiw a2, a2, -512
-; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZDINXZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV64IZDINXZHINX-NEXT:    neg a0, a0
+; RV64IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZDINXZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcvt_w_s_sat_i16:
@@ -6505,13 +6505,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
-; RV32ID-ILP32-NEXT:    feq.s a0, fa5, fa5
-; RV32ID-ILP32-NEXT:    neg a0, a0
-; RV32ID-ILP32-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV32ID-ILP32-NEXT:    lui a1, 815104
-; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a1
+; RV32ID-ILP32-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV32ID-ILP32-NEXT:    feq.s a1, fa5, fa5
+; RV32ID-ILP32-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV32ID-ILP32-NEXT:    lui a0, 815104
+; RV32ID-ILP32-NEXT:    fmv.w.x fa3, a0
 ; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa3
+; RV32ID-ILP32-NEXT:    neg a0, a1
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-ILP32-NEXT:    and a0, a0, a1
@@ -6525,13 +6525,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
-; RV64ID-LP64-NEXT:    feq.s a0, fa5, fa5
-; RV64ID-LP64-NEXT:    neg a0, a0
-; RV64ID-LP64-NEXT:    lui a1, %hi(.LCPI32_0)
-; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; RV64ID-LP64-NEXT:    lui a1, 815104
-; RV64ID-LP64-NEXT:    fmv.w.x fa3, a1
+; RV64ID-LP64-NEXT:    lui a0, %hi(.LCPI32_0)
+; RV64ID-LP64-NEXT:    feq.s a1, fa5, fa5
+; RV64ID-LP64-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; RV64ID-LP64-NEXT:    lui a0, 815104
+; RV64ID-LP64-NEXT:    fmv.w.x fa3, a0
 ; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa3
+; RV64ID-LP64-NEXT:    neg a0, a1
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-LP64-NEXT:    and a0, a0, a1
@@ -6545,12 +6545,12 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    feq.s a0, fa0, fa0
-; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    lui a1, %hi(.LCPI32_0)
 ; RV32ID-NEXT:    flw fa5, %lo(.LCPI32_0)(a1)
 ; RV32ID-NEXT:    lui a1, 815104
 ; RV32ID-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-NEXT:    fmax.s fa4, fa0, fa4
+; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa4, fa5
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
 ; RV32ID-NEXT:    and a0, a0, a1
@@ -6564,12 +6564,12 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
 ; RV64ID-NEXT:    feq.s a0, fa0, fa0
-; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    lui a1, %hi(.LCPI32_0)
 ; RV64ID-NEXT:    flw fa5, %lo(.LCPI32_0)(a1)
 ; RV64ID-NEXT:    lui a1, 815104
 ; RV64ID-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-NEXT:    fmax.s fa4, fa0, fa4
+; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa4, fa5
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
 ; RV64ID-NEXT:    and a0, a0, a1
@@ -6580,13 +6580,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; CHECK32-IZFHMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK32-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK32-IZFHMIN-NEXT:    neg a0, a0
-; CHECK32-IZFHMIN-NEXT:    lui a1, %hi(.LCPI32_0)
-; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; CHECK32-IZFHMIN-NEXT:    lui a1, 815104
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK32-IZFHMIN-NEXT:    lui a0, %hi(.LCPI32_0)
+; CHECK32-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK32-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; CHECK32-IZFHMIN-NEXT:    lui a0, 815104
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
+; CHECK32-IZFHMIN-NEXT:    neg a0, a1
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.w.s a1, fa5, rtz
 ; CHECK32-IZFHMIN-NEXT:    and a0, a0, a1
@@ -6595,13 +6595,13 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; CHECK64-IZFHMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
-; CHECK64-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
-; CHECK64-IZFHMIN-NEXT:    neg a0, a0
-; CHECK64-IZFHMIN-NEXT:    lui a1, %hi(.LCPI32_0)
-; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a1)
-; CHECK64-IZFHMIN-NEXT:    lui a1, 815104
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a1
+; CHECK64-IZFHMIN-NEXT:    lui a0, %hi(.LCPI32_0)
+; CHECK64-IZFHMIN-NEXT:    feq.s a1, fa5, fa5
+; CHECK64-IZFHMIN-NEXT:    flw fa4, %lo(.LCPI32_0)(a0)
+; CHECK64-IZFHMIN-NEXT:    lui a0, 815104
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa3, a0
 ; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa3
+; CHECK64-IZFHMIN-NEXT:    neg a0, a1
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.l.s a1, fa5, rtz
 ; CHECK64-IZFHMIN-NEXT:    and a0, a0, a1
@@ -6610,57 +6610,57 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZHINXMIN-NEXT:    lui a2, 815104
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK32-IZHINXMIN-NEXT:    lui a2, 290816
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK32-IZHINXMIN-NEXT:    addi a2, a2, -512
-; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZHINXMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZHINXMIN-NEXT:    lui a2, 815104
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK64-IZHINXMIN-NEXT:    lui a2, 290816
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK64-IZHINXMIN-NEXT:    addiw a2, a2, -512
-; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 815104
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 290816
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a2, a2, -512
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i16:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 815104
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 815104
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 290816
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECK64-IZDINXZHINXMIN-NEXT:    addiw a2, a2, -512
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    ret
 start:
   %0 = tail call i16 @llvm.fptosi.sat.i16.f16(half %a)
@@ -6861,8 +6861,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZHINX-NEXT:    lui a1, 292864
+; RV32IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZHINX-NEXT:    addi a1, a1, -256
 ; RV32IZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV32IZHINX-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -6871,8 +6871,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV64IZHINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZHINX-NEXT:    lui a1, 292864
+; RV64IZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZHINX-NEXT:    addiw a1, a1, -256
 ; RV64IZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV64IZHINX-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -6881,8 +6881,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV32IZDINXZHINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZDINXZHINX-NEXT:    lui a1, 292864
+; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV32IZDINXZHINX-NEXT:    addi a1, a1, -256
 ; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -6891,8 +6891,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; RV64IZDINXZHINX-LABEL: fcvt_wu_s_sat_i16:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZDINXZHINX-NEXT:    lui a1, 292864
+; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, zero
 ; RV64IZDINXZHINX-NEXT:    addiw a1, a1, -256
 ; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -7067,8 +7067,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZHINXMIN-NEXT:    lui a1, 292864
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZHINXMIN-NEXT:    addi a1, a1, -256
 ; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -7077,8 +7077,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; CHECK64-IZHINXMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZHINXMIN-NEXT:    lui a1, 292864
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZHINXMIN-NEXT:    addiw a1, a1, -256
 ; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -7087,8 +7087,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 292864
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK32-IZDINXZHINXMIN-NEXT:    addi a1, a1, -256
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.wu.s a0, a0, rtz
@@ -7097,8 +7097,8 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind {
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_wu_s_sat_i16:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 292864
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, zero
 ; CHECK64-IZDINXZHINXMIN-NEXT:    addiw a1, a1, -256
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.lu.s a0, a0, rtz
@@ -7258,12 +7258,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32IZFH-LABEL: fcvt_w_s_sat_i8:
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV32IZFH-NEXT:    lui a0, 798720
+; RV32IZFH-NEXT:    lui a1, 274400
+; RV32IZFH-NEXT:    fmv.w.x fa4, a0
 ; RV32IZFH-NEXT:    feq.s a0, fa5, fa5
 ; RV32IZFH-NEXT:    neg a0, a0
-; RV32IZFH-NEXT:    lui a1, 798720
-; RV32IZFH-NEXT:    fmv.w.x fa4, a1
 ; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
-; RV32IZFH-NEXT:    lui a1, 274400
 ; RV32IZFH-NEXT:    fmv.w.x fa4, a1
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7273,12 +7273,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV64IZFH-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV64IZFH-NEXT:    lui a0, 798720
+; RV64IZFH-NEXT:    lui a1, 274400
+; RV64IZFH-NEXT:    fmv.w.x fa4, a0
 ; RV64IZFH-NEXT:    feq.s a0, fa5, fa5
 ; RV64IZFH-NEXT:    neg a0, a0
-; RV64IZFH-NEXT:    lui a1, 798720
-; RV64IZFH-NEXT:    fmv.w.x fa4, a1
 ; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
-; RV64IZFH-NEXT:    lui a1, 274400
 ; RV64IZFH-NEXT:    fmv.w.x fa4, a1
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7288,12 +7288,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32IDZFH-LABEL: fcvt_w_s_sat_i8:
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV32IDZFH-NEXT:    lui a0, 798720
+; RV32IDZFH-NEXT:    lui a1, 274400
+; RV32IDZFH-NEXT:    fmv.w.x fa4, a0
 ; RV32IDZFH-NEXT:    feq.s a0, fa5, fa5
 ; RV32IDZFH-NEXT:    neg a0, a0
-; RV32IDZFH-NEXT:    lui a1, 798720
-; RV32IDZFH-NEXT:    fmv.w.x fa4, a1
 ; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
-; RV32IDZFH-NEXT:    lui a1, 274400
 ; RV32IDZFH-NEXT:    fmv.w.x fa4, a1
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7303,12 +7303,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV64IDZFH-LABEL: fcvt_w_s_sat_i8:
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
+; RV64IDZFH-NEXT:    lui a0, 798720
+; RV64IDZFH-NEXT:    lui a1, 274400
+; RV64IDZFH-NEXT:    fmv.w.x fa4, a0
 ; RV64IDZFH-NEXT:    feq.s a0, fa5, fa5
 ; RV64IDZFH-NEXT:    neg a0, a0
-; RV64IDZFH-NEXT:    lui a1, 798720
-; RV64IDZFH-NEXT:    fmv.w.x fa4, a1
 ; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
-; RV64IDZFH-NEXT:    lui a1, 274400
 ; RV64IDZFH-NEXT:    fmv.w.x fa4, a1
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7318,53 +7318,53 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32IZHINX-LABEL: fcvt_w_s_sat_i8:
 ; RV32IZHINX:       # %bb.0: # %start
 ; RV32IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZHINX-NEXT:    neg a1, a1
-; RV32IZHINX-NEXT:    lui a2, 798720
-; RV32IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZHINX-NEXT:    lui a1, 798720
 ; RV32IZHINX-NEXT:    lui a2, 274400
-; RV32IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZHINX-NEXT:    and a0, a1, a0
+; RV32IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZHINX-NEXT:    feq.s a0, a0, a0
+; RV32IZHINX-NEXT:    neg a0, a0
+; RV32IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZHINX-NEXT:    and a0, a0, a1
 ; RV32IZHINX-NEXT:    ret
 ;
 ; RV64IZHINX-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZHINX:       # %bb.0: # %start
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZHINX-NEXT:    neg a1, a1
-; RV64IZHINX-NEXT:    lui a2, 798720
-; RV64IZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZHINX-NEXT:    lui a1, 798720
 ; RV64IZHINX-NEXT:    lui a2, 274400
-; RV64IZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZHINX-NEXT:    and a0, a1, a0
+; RV64IZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZHINX-NEXT:    feq.s a0, a0, a0
+; RV64IZHINX-NEXT:    neg a0, a0
+; RV64IZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZHINX-NEXT:    and a0, a0, a1
 ; RV64IZHINX-NEXT:    ret
 ;
 ; RV32IZDINXZHINX-LABEL: fcvt_w_s_sat_i8:
 ; RV32IZDINXZHINX:       # %bb.0: # %start
 ; RV32IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV32IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV32IZDINXZHINX-NEXT:    neg a1, a1
-; RV32IZDINXZHINX-NEXT:    lui a2, 798720
-; RV32IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV32IZDINXZHINX-NEXT:    lui a1, 798720
 ; RV32IZDINXZHINX-NEXT:    lui a2, 274400
-; RV32IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV32IZDINXZHINX-NEXT:    fcvt.w.s a0, a0, rtz
-; RV32IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV32IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV32IZDINXZHINX-NEXT:    feq.s a0, a0, a0
+; RV32IZDINXZHINX-NEXT:    neg a0, a0
+; RV32IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV32IZDINXZHINX-NEXT:    fcvt.w.s a1, a1, rtz
+; RV32IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV32IZDINXZHINX-NEXT:    ret
 ;
 ; RV64IZDINXZHINX-LABEL: fcvt_w_s_sat_i8:
 ; RV64IZDINXZHINX:       # %bb.0: # %start
 ; RV64IZDINXZHINX-NEXT:    fcvt.s.h a0, a0
-; RV64IZDINXZHINX-NEXT:    feq.s a1, a0, a0
-; RV64IZDINXZHINX-NEXT:    neg a1, a1
-; RV64IZDINXZHINX-NEXT:    lui a2, 798720
-; RV64IZDINXZHINX-NEXT:    fmax.s a0, a0, a2
+; RV64IZDINXZHINX-NEXT:    lui a1, 798720
 ; RV64IZDINXZHINX-NEXT:    lui a2, 274400
-; RV64IZDINXZHINX-NEXT:    fmin.s a0, a0, a2
-; RV64IZDINXZHINX-NEXT:    fcvt.l.s a0, a0, rtz
-; RV64IZDINXZHINX-NEXT:    and a0, a1, a0
+; RV64IZDINXZHINX-NEXT:    fmax.s a1, a0, a1
+; RV64IZDINXZHINX-NEXT:    feq.s a0, a0, a0
+; RV64IZDINXZHINX-NEXT:    neg a0, a0
+; RV64IZDINXZHINX-NEXT:    fmin.s a1, a1, a2
+; RV64IZDINXZHINX-NEXT:    fcvt.l.s a1, a1, rtz
+; RV64IZDINXZHINX-NEXT:    and a0, a0, a1
 ; RV64IZDINXZHINX-NEXT:    ret
 ;
 ; RV32I-LABEL: fcvt_w_s_sat_i8:
@@ -7459,12 +7459,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
+; RV32ID-ILP32-NEXT:    lui a0, 798720
+; RV32ID-ILP32-NEXT:    lui a1, 274400
+; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32-NEXT:    feq.s a0, fa5, fa5
 ; RV32ID-ILP32-NEXT:    neg a0, a0
-; RV32ID-ILP32-NEXT:    lui a1, 798720
-; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
-; RV32ID-ILP32-NEXT:    lui a1, 274400
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7479,12 +7479,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
+; RV64ID-LP64-NEXT:    lui a0, 798720
+; RV64ID-LP64-NEXT:    lui a1, 274400
+; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64-NEXT:    feq.s a0, fa5, fa5
 ; RV64ID-LP64-NEXT:    neg a0, a0
-; RV64ID-LP64-NEXT:    lui a1, 798720
-; RV64ID-LP64-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
-; RV64ID-LP64-NEXT:    lui a1, 274400
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7499,11 +7499,11 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    feq.s a0, fa0, fa0
-; RV32ID-NEXT:    neg a0, a0
 ; RV32ID-NEXT:    lui a1, 798720
 ; RV32ID-NEXT:    fmv.w.x fa5, a1
-; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32ID-NEXT:    lui a1, 274400
+; RV32ID-NEXT:    neg a0, a0
+; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32ID-NEXT:    fmv.w.x fa4, a1
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7518,11 +7518,11 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
 ; RV64ID-NEXT:    feq.s a0, fa0, fa0
-; RV64ID-NEXT:    neg a0, a0
 ; RV64ID-NEXT:    lui a1, 798720
 ; RV64ID-NEXT:    fmv.w.x fa5, a1
-; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64ID-NEXT:    lui a1, 274400
+; RV64ID-NEXT:    neg a0, a0
+; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64ID-NEXT:    fmv.w.x fa4, a1
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7534,12 +7534,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; CHECK32-IZFHMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECK32-IZFHMIN-NEXT:    lui a0, 798720
+; CHECK32-IZFHMIN-NEXT:    lui a1, 274400
+; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK32-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
 ; CHECK32-IZFHMIN-NEXT:    neg a0, a0
-; CHECK32-IZFHMIN-NEXT:    lui a1, 798720
-; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
-; CHECK32-IZFHMIN-NEXT:    lui a1, 274400
 ; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.w.s a1, fa5, rtz
@@ -7549,12 +7549,12 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; CHECK64-IZFHMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
+; CHECK64-IZFHMIN-NEXT:    lui a0, 798720
+; CHECK64-IZFHMIN-NEXT:    lui a1, 274400
+; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK64-IZFHMIN-NEXT:    feq.s a0, fa5, fa5
 ; CHECK64-IZFHMIN-NEXT:    neg a0, a0
-; CHECK64-IZFHMIN-NEXT:    lui a1, 798720
-; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
-; CHECK64-IZFHMIN-NEXT:    lui a1, 274400
 ; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a1
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.l.s a1, fa5, rtz
@@ -7564,53 +7564,53 @@ define signext i8 @fcvt_w_s_sat_i8(half %a) nounwind {
 ; CHECK32-IZHINXMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK32-IZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZHINXMIN-NEXT:    lui a2, 798720
-; CHECK32-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZHINXMIN-NEXT:    lui a1, 798720
 ; CHECK32-IZHINXMIN-NEXT:    lui a2, 274400
-; CHECK32-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZHINXMIN-NEXT:    feq.s a0, a0, a0
+; CHECK32-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZHINXMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK64-IZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZHINXMIN-NEXT:    lui a2, 798720
-; CHECK64-IZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZHINXMIN-NEXT:    lui a1, 798720
 ; CHECK64-IZHINXMIN-NEXT:    lui a2, 274400
-; CHECK64-IZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZHINXMIN-NEXT:    feq.s a0, a0, a0
+; CHECK64-IZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZHINXMIN-NEXT:    ret
 ;
 ; CHECK32-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK32-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK32-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 798720
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    lui a1, 798720
 ; CHECK32-IZDINXZHINXMIN-NEXT:    lui a2, 274400
-; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a0, a0, rtz
-; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK32-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK32-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK32-IZDINXZHINXMIN-NEXT:    fcvt.w.s a1, a1, rtz
+; CHECK32-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK32-IZDINXZHINXMIN-NEXT:    ret
 ;
 ; CHECK64-IZDINXZHINXMIN-LABEL: fcvt_w_s_sat_i8:
 ; CHECK64-IZDINXZHINXMIN:       # %bb.0: # %start
 ; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.s.h a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a1, a0, a0
-; CHECK64-IZDINXZHINXMIN-NEXT:    neg a1, a1
-; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 798720
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a0, a0, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    lui a1, 798720
 ; CHECK64-IZDINXZHINXMIN-NEXT:    lui a2, 274400
-; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a0, a0, a2
-; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a0, a0, rtz
-; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a1, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmax.s a1, a0, a1
+; CHECK64-IZDINXZHINXMIN-NEXT:    feq.s a0, a0, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    neg a0, a0
+; CHECK64-IZDINXZHINXMIN-NEXT:    fmin.s a1, a1, a2
+; CHECK64-IZDINXZHINXMIN-NEXT:    fcvt.l.s a1, a1, rtz
+; CHECK64-IZDINXZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECK64-IZDINXZHINXMIN-NEXT:    ret
 start:
   %0 = tail call i8 @llvm.fptosi.sat.i8.f16(half %a)
@@ -7769,8 +7769,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV32IZFH:       # %bb.0: # %start
 ; RV32IZFH-NEXT:    fcvt.s.h fa5, fa0
 ; RV32IZFH-NEXT:    fmv.w.x fa4, zero
-; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    lui a0, 276464
+; RV32IZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fmv.w.x fa4, a0
 ; RV32IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IZFH-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7780,8 +7780,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV64IZFH:       # %bb.0: # %start
 ; RV64IZFH-NEXT:    fcvt.s.h fa5, fa0
 ; RV64IZFH-NEXT:    fmv.w.x fa4, zero
-; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    lui a0, 276464
+; RV64IZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fmv.w.x fa4, a0
 ; RV64IZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IZFH-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -7791,8 +7791,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV32IDZFH:       # %bb.0: # %start
 ; RV32IDZFH-NEXT:    fcvt.s.h fa5, fa0
 ; RV32IDZFH-NEXT:    fmv.w.x fa4, zero
-; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    lui a0, 276464
+; RV32IDZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fmv.w.x fa4, a0
 ; RV32IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32IDZFH-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7802,8 +7802,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV64IDZFH:       # %bb.0: # %start
 ; RV64IDZFH-NEXT:    fcvt.s.h fa5, fa0
 ; RV64IDZFH-NEXT:    fmv.w.x fa4, zero
-; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    lui a0, 276464
+; RV64IDZFH-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fmv.w.x fa4, a0
 ; RV64IDZFH-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64IDZFH-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -7926,8 +7926,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV32ID-ILP32-NEXT:    call __extendhfsf2
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa5, a0
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, zero
-; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    lui a0, 276464
+; RV32ID-ILP32-NEXT:    fmax.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-ILP32-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-ILP32-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7942,8 +7942,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV64ID-LP64-NEXT:    call __extendhfsf2
 ; RV64ID-LP64-NEXT:    fmv.w.x fa5, a0
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, zero
-; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    lui a0, 276464
+; RV64ID-LP64-NEXT:    fmax.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-LP64-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-LP64-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -7957,8 +7957,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV32ID-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32ID-NEXT:    call __extendhfsf2
 ; RV32ID-NEXT:    fmv.w.x fa5, zero
-; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32ID-NEXT:    lui a0, 276464
+; RV32ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV32ID-NEXT:    fmv.w.x fa4, a0
 ; RV32ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV32ID-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7972,8 +7972,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; RV64ID-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64ID-NEXT:    call __extendhfsf2
 ; RV64ID-NEXT:    fmv.w.x fa5, zero
-; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64ID-NEXT:    lui a0, 276464
+; RV64ID-NEXT:    fmax.s fa5, fa0, fa5
 ; RV64ID-NEXT:    fmv.w.x fa4, a0
 ; RV64ID-NEXT:    fmin.s fa5, fa5, fa4
 ; RV64ID-NEXT:    fcvt.lu.s a0, fa5, rtz
@@ -7985,8 +7985,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; CHECK32-IZFHMIN:       # %bb.0: # %start
 ; CHECK32-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    lui a0, 276464
+; CHECK32-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK32-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK32-IZFHMIN-NEXT:    fcvt.wu.s a0, fa5, rtz
@@ -7996,8 +7996,8 @@ define zeroext i8 @fcvt_wu_s_sat_i8(half %a) nounwind {
 ; CHECK64-IZFHMIN:       # %bb.0: # %start
 ; CHECK64-IZFHMIN-NEXT:    fcvt.s.h fa5, fa0
 ; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, zero
-; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    lui a0, 276464
+; CHECK64-IZFHMIN-NEXT:    fmax.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fmv.w.x fa4, a0
 ; CHECK64-IZFHMIN-NEXT:    fmin.s fa5, fa5, fa4
 ; CHECK64-IZFHMIN-NEXT:    fcvt.lu.s a0, fa5, rtz

diff  --git a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
index 4bc595bcc4cc8f..12cf088e3205fd 100644
--- a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
+++ b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll
@@ -297,8 +297,8 @@ define i32 @fcmp_ord(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINXMIN-LABEL: fcmp_ord:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    ret
@@ -608,8 +608,8 @@ define i32 @fcmp_uno(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINXMIN-LABEL: fcmp_uno:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    xori a0, a0, 1
@@ -823,8 +823,8 @@ define i32 @fcmps_ord(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINXMIN-LABEL: fcmps_ord:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fle.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    fle.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fle.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    ret
@@ -1063,8 +1063,8 @@ define i32 @fcmps_uno(half %a, half %b) nounwind strictfp {
 ; CHECKIZHINXMIN-LABEL: fcmps_uno:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    fle.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    fle.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fle.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    xori a0, a0, 1

diff  --git a/llvm/test/CodeGen/RISCV/half-fcmp.ll b/llvm/test/CodeGen/RISCV/half-fcmp.ll
index 81ef56635eebcf..d25d8cc1c19948 100644
--- a/llvm/test/CodeGen/RISCV/half-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/half-fcmp.ll
@@ -426,9 +426,9 @@ define i32 @fcmp_ord(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-ILP32F-LP64F-LABEL: fcmp_ord:
 ; CHECKIZFHMIN-ILP32F-LP64F:       # %bb.0:
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa5, fa1
+; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa4, fa0
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a0, fa5, fa5
-; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa5, fa0
-; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a1, fa5, fa5
+; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a1, fa4, fa4
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    and a0, a1, a0
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    ret
 ;
@@ -437,8 +437,8 @@ define i32 @fcmp_ord(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a1
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
-; CHECKIZFHMIN-NEXT:    feq.s a0, fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
+; CHECKIZFHMIN-NEXT:    feq.s a0, fa4, fa4
 ; CHECKIZFHMIN-NEXT:    feq.s a1, fa5, fa5
 ; CHECKIZFHMIN-NEXT:    and a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    ret
@@ -446,8 +446,8 @@ define i32 @fcmp_ord(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: fcmp_ord:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    ret
@@ -864,9 +864,9 @@ define i32 @fcmp_uno(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-ILP32F-LP64F-LABEL: fcmp_uno:
 ; CHECKIZFHMIN-ILP32F-LP64F:       # %bb.0:
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa5, fa1
+; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa4, fa0
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a0, fa5, fa5
-; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    fcvt.s.h fa5, fa0
-; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a1, fa5, fa5
+; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    feq.s a1, fa4, fa4
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    and a0, a1, a0
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    xori a0, a0, 1
 ; CHECKIZFHMIN-ILP32F-LP64F-NEXT:    ret
@@ -876,8 +876,8 @@ define i32 @fcmp_uno(half %a, half %b) nounwind {
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa5, a0
 ; CHECKIZFHMIN-NEXT:    fmv.h.x fa4, a1
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa4
-; CHECKIZFHMIN-NEXT:    feq.s a0, fa4, fa4
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa5
+; CHECKIZFHMIN-NEXT:    feq.s a0, fa4, fa4
 ; CHECKIZFHMIN-NEXT:    feq.s a1, fa5, fa5
 ; CHECKIZFHMIN-NEXT:    and a0, a1, a0
 ; CHECKIZFHMIN-NEXT:    xori a0, a0, 1
@@ -886,8 +886,8 @@ define i32 @fcmp_uno(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: fcmp_uno:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a1, a1, a1
 ; CHECKIZHINXMIN-NEXT:    feq.s a0, a0, a0
 ; CHECKIZHINXMIN-NEXT:    and a0, a0, a1
 ; CHECKIZHINXMIN-NEXT:    xori a0, a0, 1

diff  --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
index f063c1fef4e16b..0d26e660c979bb 100644
--- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
@@ -2050,8 +2050,8 @@ define half @copysign_f16(half %a, half %b) nounwind {
 ; RV32I-LABEL: copysign_f16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 1048568
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 17
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    or a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -2059,8 +2059,8 @@ define half @copysign_f16(half %a, half %b) nounwind {
 ; RV64I-LABEL: copysign_f16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 1048568
-; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a0, a0, 49
+; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -2094,8 +2094,8 @@ define half @copysign_f16(half %a, half %b) nounwind {
 ; RV32IZHINXMIN-NEXT:    # kill: def $x11_h killed $x11_h def $x11
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32IZHINXMIN-NEXT:    lui a2, 1048568
-; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
+; RV32IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -2106,8 +2106,8 @@ define half @copysign_f16(half %a, half %b) nounwind {
 ; RV64IZHINXMIN-NEXT:    # kill: def $x11_h killed $x11_h def $x11
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64IZHINXMIN-NEXT:    lui a2, 1048568
-; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
+; RV64IZHINXMIN-NEXT:    and a1, a1, a2
 ; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    or a0, a0, a1
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h killed $x10
@@ -2801,8 +2801,8 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV32I-LABEL: isnan_d_fpclass:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 17
-; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    li a1, 31
+; RV32I-NEXT:    srli a0, a0, 17
 ; RV32I-NEXT:    slli a1, a1, 10
 ; RV32I-NEXT:    slt a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -2810,8 +2810,8 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV64I-LABEL: isnan_d_fpclass:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 49
-; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    li a1, 31
+; RV64I-NEXT:    srli a0, a0, 49
 ; RV64I-NEXT:    slli a1, a1, 10
 ; RV64I-NEXT:    slt a0, a1, a0
 ; RV64I-NEXT:    ret
@@ -2819,9 +2819,9 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV32IZFHMIN-LABEL: isnan_d_fpclass:
 ; RV32IZFHMIN:       # %bb.0:
 ; RV32IZFHMIN-NEXT:    fmv.x.h a0, fa0
+; RV32IZFHMIN-NEXT:    li a1, 31
 ; RV32IZFHMIN-NEXT:    slli a0, a0, 17
 ; RV32IZFHMIN-NEXT:    srli a0, a0, 17
-; RV32IZFHMIN-NEXT:    li a1, 31
 ; RV32IZFHMIN-NEXT:    slli a1, a1, 10
 ; RV32IZFHMIN-NEXT:    slt a0, a1, a0
 ; RV32IZFHMIN-NEXT:    ret
@@ -2829,9 +2829,9 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV64IZFHMIN-LABEL: isnan_d_fpclass:
 ; RV64IZFHMIN:       # %bb.0:
 ; RV64IZFHMIN-NEXT:    fmv.x.h a0, fa0
+; RV64IZFHMIN-NEXT:    li a1, 31
 ; RV64IZFHMIN-NEXT:    slli a0, a0, 49
 ; RV64IZFHMIN-NEXT:    srli a0, a0, 49
-; RV64IZFHMIN-NEXT:    li a1, 31
 ; RV64IZFHMIN-NEXT:    slli a1, a1, 10
 ; RV64IZFHMIN-NEXT:    slt a0, a1, a0
 ; RV64IZFHMIN-NEXT:    ret
@@ -2840,8 +2840,8 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV32IZHINXMIN:       # %bb.0:
 ; RV32IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV32IZHINXMIN-NEXT:    slli a0, a0, 17
-; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    li a1, 31
+; RV32IZHINXMIN-NEXT:    srli a0, a0, 17
 ; RV32IZHINXMIN-NEXT:    slli a1, a1, 10
 ; RV32IZHINXMIN-NEXT:    slt a0, a1, a0
 ; RV32IZHINXMIN-NEXT:    ret
@@ -2850,8 +2850,8 @@ define i1 @isnan_d_fpclass(half %x) {
 ; RV64IZHINXMIN:       # %bb.0:
 ; RV64IZHINXMIN-NEXT:    # kill: def $x10_h killed $x10_h def $x10
 ; RV64IZHINXMIN-NEXT:    slli a0, a0, 49
-; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    li a1, 31
+; RV64IZHINXMIN-NEXT:    srli a0, a0, 49
 ; RV64IZHINXMIN-NEXT:    slli a1, a1, 10
 ; RV64IZHINXMIN-NEXT:    slt a0, a1, a0
 ; RV64IZHINXMIN-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/half-mem.ll b/llvm/test/CodeGen/RISCV/half-mem.ll
index 5b6a94a83f94bf..9ac2a4d037f8a8 100644
--- a/llvm/test/CodeGen/RISCV/half-mem.ll
+++ b/llvm/test/CodeGen/RISCV/half-mem.ll
@@ -134,10 +134,10 @@ define half @flh_fsh_global(half %a, half %b) nounwind {
 ; CHECKIZFHMIN:       # %bb.0:
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa5, fa1
 ; CHECKIZFHMIN-NEXT:    fcvt.s.h fa4, fa0
+; CHECKIZFHMIN-NEXT:    lui a0, %hi(G)
 ; CHECKIZFHMIN-NEXT:    fadd.s fa5, fa4, fa5
+; CHECKIZFHMIN-NEXT:    flh fa4, %lo(G)(a0)
 ; CHECKIZFHMIN-NEXT:    fcvt.h.s fa0, fa5
-; CHECKIZFHMIN-NEXT:    lui a0, %hi(G)
-; CHECKIZFHMIN-NEXT:    flh fa5, %lo(G)(a0)
 ; CHECKIZFHMIN-NEXT:    addi a1, a0, %lo(G)
 ; CHECKIZFHMIN-NEXT:    fsh fa0, %lo(G)(a0)
 ; CHECKIZFHMIN-NEXT:    flh fa5, 18(a1)
@@ -148,14 +148,14 @@ define half @flh_fsh_global(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a1, a1
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a0, a0
+; CHECKIZHINXMIN-NEXT:    lui a2, %hi(G)
 ; CHECKIZHINXMIN-NEXT:    fadd.s a0, a0, a1
+; CHECKIZHINXMIN-NEXT:    lh zero, %lo(G)(a2)
 ; CHECKIZHINXMIN-NEXT:    fcvt.h.s a0, a0
-; CHECKIZHINXMIN-NEXT:    lui a1, %hi(G)
-; CHECKIZHINXMIN-NEXT:    lh zero, %lo(G)(a1)
-; CHECKIZHINXMIN-NEXT:    addi a2, a1, %lo(G)
-; CHECKIZHINXMIN-NEXT:    sh a0, %lo(G)(a1)
-; CHECKIZHINXMIN-NEXT:    lh zero, 18(a2)
-; CHECKIZHINXMIN-NEXT:    sh a0, 18(a2)
+; CHECKIZHINXMIN-NEXT:    addi a1, a2, %lo(G)
+; CHECKIZHINXMIN-NEXT:    sh a0, %lo(G)(a2)
+; CHECKIZHINXMIN-NEXT:    lh zero, 18(a1)
+; CHECKIZHINXMIN-NEXT:    sh a0, 18(a1)
 ; CHECKIZHINXMIN-NEXT:    ret
   %1 = fadd half %a, %b
   %2 = load volatile half, ptr @G

diff  --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
index 9e1a26e74d70b9..3b645bf8aef912 100644
--- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
+++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll
@@ -115,7 +115,7 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB1_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -123,19 +123,19 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB1_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI1_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI1_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB1_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB1_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB1_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -172,7 +172,7 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB1_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -180,19 +180,19 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB1_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB1_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB1_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB1_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -236,13 +236,13 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB1_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -250,19 +250,19 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB1_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI1_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI1_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB1_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB1_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB1_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -313,7 +313,7 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB1_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -321,19 +321,19 @@ define i64 @test_floor_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB1_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB1_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB1_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB1_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -529,10 +529,10 @@ define i64 @test_floor_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI3_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI3_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -572,11 +572,11 @@ define i64 @test_floor_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -630,10 +630,10 @@ define i64 @test_floor_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI3_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI3_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -687,11 +687,11 @@ define i64 @test_floor_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -823,7 +823,7 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB5_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -831,19 +831,19 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB5_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI5_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI5_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB5_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB5_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB5_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -880,7 +880,7 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB5_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -888,19 +888,19 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB5_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB5_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB5_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB5_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -944,13 +944,13 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB5_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -958,19 +958,19 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB5_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI5_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI5_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB5_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB5_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB5_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1021,7 +1021,7 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB5_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -1029,19 +1029,19 @@ define i64 @test_ceil_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB5_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB5_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB5_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB5_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1237,10 +1237,10 @@ define i64 @test_ceil_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI7_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI7_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1280,11 +1280,11 @@ define i64 @test_ceil_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1338,10 +1338,10 @@ define i64 @test_ceil_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI7_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI7_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1395,11 +1395,11 @@ define i64 @test_ceil_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1531,7 +1531,7 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB9_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -1539,19 +1539,19 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB9_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI9_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI9_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB9_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB9_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB9_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1588,7 +1588,7 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB9_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -1596,19 +1596,19 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB9_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB9_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB9_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB9_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1652,13 +1652,13 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB9_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -1666,19 +1666,19 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB9_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI9_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI9_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB9_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB9_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB9_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -1729,7 +1729,7 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB9_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -1737,19 +1737,19 @@ define i64 @test_trunc_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB9_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB9_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB9_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB9_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -1945,10 +1945,10 @@ define i64 @test_trunc_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI11_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI11_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -1988,11 +1988,11 @@ define i64 @test_trunc_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2046,10 +2046,10 @@ define i64 @test_trunc_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI11_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI11_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2103,11 +2103,11 @@ define i64 @test_trunc_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2239,7 +2239,7 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB13_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -2247,19 +2247,19 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB13_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI13_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI13_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB13_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB13_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB13_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -2296,7 +2296,7 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB13_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -2304,19 +2304,19 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB13_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB13_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB13_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB13_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2360,13 +2360,13 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB13_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -2374,19 +2374,19 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB13_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI13_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI13_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB13_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB13_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB13_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -2437,7 +2437,7 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB13_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -2445,19 +2445,19 @@ define i64 @test_round_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB13_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB13_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB13_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB13_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -2653,10 +2653,10 @@ define i64 @test_round_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI15_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI15_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2696,11 +2696,11 @@ define i64 @test_round_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2754,10 +2754,10 @@ define i64 @test_round_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI15_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI15_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2811,11 +2811,11 @@ define i64 @test_round_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -2947,7 +2947,7 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB17_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -2955,19 +2955,19 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB17_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI17_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI17_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB17_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB17_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB17_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -3004,7 +3004,7 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB17_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -3012,19 +3012,19 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB17_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB17_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB17_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB17_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -3068,13 +3068,13 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB17_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -3082,19 +3082,19 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB17_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI17_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI17_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB17_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB17_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB17_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -3145,7 +3145,7 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB17_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -3153,19 +3153,19 @@ define i64 @test_roundeven_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB17_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB17_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB17_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB17_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -3361,10 +3361,10 @@ define i64 @test_roundeven_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI19_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI19_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3404,11 +3404,11 @@ define i64 @test_roundeven_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3462,10 +3462,10 @@ define i64 @test_roundeven_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI19_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI19_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3519,11 +3519,11 @@ define i64 @test_roundeven_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -3655,7 +3655,7 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFH-NEXT:    fmv.s fa0, fs0
 ; RV32IZFH-NEXT:    call __fixsfdi
-; RV32IZFH-NEXT:    lui a4, 524288
+; RV32IZFH-NEXT:    lui a3, 524288
 ; RV32IZFH-NEXT:    lui a2, 524288
 ; RV32IZFH-NEXT:    beqz s0, .LBB21_4
 ; RV32IZFH-NEXT:  # %bb.3:
@@ -3663,19 +3663,19 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFH-NEXT:  .LBB21_4:
 ; RV32IZFH-NEXT:    lui a1, %hi(.LCPI21_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI21_1)(a1)
-; RV32IZFH-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFH-NEXT:    beqz a3, .LBB21_6
+; RV32IZFH-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFH-NEXT:    beqz a1, .LBB21_6
 ; RV32IZFH-NEXT:  # %bb.5:
-; RV32IZFH-NEXT:    addi a2, a4, -1
+; RV32IZFH-NEXT:    addi a2, a3, -1
 ; RV32IZFH-NEXT:  .LBB21_6:
-; RV32IZFH-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFH-NEXT:    neg a4, a1
-; RV32IZFH-NEXT:    and a1, a4, a2
-; RV32IZFH-NEXT:    neg a2, s0
-; RV32IZFH-NEXT:    and a0, a2, a0
-; RV32IZFH-NEXT:    neg a2, a3
-; RV32IZFH-NEXT:    or a0, a2, a0
+; RV32IZFH-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFH-NEXT:    neg a4, s0
+; RV32IZFH-NEXT:    neg a5, a1
+; RV32IZFH-NEXT:    neg a3, a3
 ; RV32IZFH-NEXT:    and a0, a4, a0
+; RV32IZFH-NEXT:    and a1, a3, a2
+; RV32IZFH-NEXT:    or a0, a5, a0
+; RV32IZFH-NEXT:    and a0, a3, a0
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -3712,7 +3712,7 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINX-NEXT:    mv a0, s0
 ; RV32IZHINX-NEXT:    call __fixsfdi
-; RV32IZHINX-NEXT:    lui a4, 524288
+; RV32IZHINX-NEXT:    lui a3, 524288
 ; RV32IZHINX-NEXT:    lui a2, 524288
 ; RV32IZHINX-NEXT:    beqz s1, .LBB21_4
 ; RV32IZHINX-NEXT:  # %bb.3:
@@ -3720,19 +3720,19 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINX-NEXT:  .LBB21_4:
 ; RV32IZHINX-NEXT:    lui a1, 389120
 ; RV32IZHINX-NEXT:    addi a1, a1, -1
-; RV32IZHINX-NEXT:    flt.s a3, a1, s0
-; RV32IZHINX-NEXT:    beqz a3, .LBB21_6
+; RV32IZHINX-NEXT:    flt.s a1, a1, s0
+; RV32IZHINX-NEXT:    beqz a1, .LBB21_6
 ; RV32IZHINX-NEXT:  # %bb.5:
-; RV32IZHINX-NEXT:    addi a2, a4, -1
+; RV32IZHINX-NEXT:    addi a2, a3, -1
 ; RV32IZHINX-NEXT:  .LBB21_6:
-; RV32IZHINX-NEXT:    feq.s a1, s0, s0
-; RV32IZHINX-NEXT:    neg a4, a1
-; RV32IZHINX-NEXT:    and a1, a4, a2
-; RV32IZHINX-NEXT:    neg a2, s1
-; RV32IZHINX-NEXT:    and a0, a2, a0
-; RV32IZHINX-NEXT:    neg a2, a3
-; RV32IZHINX-NEXT:    or a0, a2, a0
+; RV32IZHINX-NEXT:    feq.s a3, s0, s0
+; RV32IZHINX-NEXT:    neg a4, s1
+; RV32IZHINX-NEXT:    neg a5, a1
+; RV32IZHINX-NEXT:    neg a3, a3
 ; RV32IZHINX-NEXT:    and a0, a4, a0
+; RV32IZHINX-NEXT:    and a1, a3, a2
+; RV32IZHINX-NEXT:    or a0, a5, a0
+; RV32IZHINX-NEXT:    and a0, a3, a0
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -3776,13 +3776,13 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fsw fs0, 4(sp) # 4-byte Folded Spill
 ; RV32IZFHMIN-NEXT:    fcvt.h.s fa5, fa5
-; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    lui a0, 913408
+; RV32IZFHMIN-NEXT:    fcvt.s.h fs0, fa5
 ; RV32IZFHMIN-NEXT:    fmv.w.x fa5, a0
 ; RV32IZFHMIN-NEXT:    fle.s s0, fa5, fs0
 ; RV32IZFHMIN-NEXT:    fmv.s fa0, fs0
 ; RV32IZFHMIN-NEXT:    call __fixsfdi
-; RV32IZFHMIN-NEXT:    lui a4, 524288
+; RV32IZFHMIN-NEXT:    lui a3, 524288
 ; RV32IZFHMIN-NEXT:    lui a2, 524288
 ; RV32IZFHMIN-NEXT:    beqz s0, .LBB21_4
 ; RV32IZFHMIN-NEXT:  # %bb.3:
@@ -3790,19 +3790,19 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:  .LBB21_4:
 ; RV32IZFHMIN-NEXT:    lui a1, %hi(.LCPI21_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI21_0)(a1)
-; RV32IZFHMIN-NEXT:    flt.s a3, fa5, fs0
-; RV32IZFHMIN-NEXT:    beqz a3, .LBB21_6
+; RV32IZFHMIN-NEXT:    flt.s a1, fa5, fs0
+; RV32IZFHMIN-NEXT:    beqz a1, .LBB21_6
 ; RV32IZFHMIN-NEXT:  # %bb.5:
-; RV32IZFHMIN-NEXT:    addi a2, a4, -1
+; RV32IZFHMIN-NEXT:    addi a2, a3, -1
 ; RV32IZFHMIN-NEXT:  .LBB21_6:
-; RV32IZFHMIN-NEXT:    feq.s a1, fs0, fs0
-; RV32IZFHMIN-NEXT:    neg a4, a1
-; RV32IZFHMIN-NEXT:    and a1, a4, a2
-; RV32IZFHMIN-NEXT:    neg a2, s0
-; RV32IZFHMIN-NEXT:    and a0, a2, a0
-; RV32IZFHMIN-NEXT:    neg a2, a3
-; RV32IZFHMIN-NEXT:    or a0, a2, a0
+; RV32IZFHMIN-NEXT:    feq.s a3, fs0, fs0
+; RV32IZFHMIN-NEXT:    neg a4, s0
+; RV32IZFHMIN-NEXT:    neg a5, a1
+; RV32IZFHMIN-NEXT:    neg a3, a3
 ; RV32IZFHMIN-NEXT:    and a0, a4, a0
+; RV32IZFHMIN-NEXT:    and a1, a3, a2
+; RV32IZFHMIN-NEXT:    or a0, a5, a0
+; RV32IZFHMIN-NEXT:    and a0, a3, a0
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    flw fs0, 4(sp) # 4-byte Folded Reload
@@ -3853,7 +3853,7 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    fle.s s1, a0, s0
 ; RV32IZHINXMIN-NEXT:    mv a0, s0
 ; RV32IZHINXMIN-NEXT:    call __fixsfdi
-; RV32IZHINXMIN-NEXT:    lui a4, 524288
+; RV32IZHINXMIN-NEXT:    lui a3, 524288
 ; RV32IZHINXMIN-NEXT:    lui a2, 524288
 ; RV32IZHINXMIN-NEXT:    beqz s1, .LBB21_4
 ; RV32IZHINXMIN-NEXT:  # %bb.3:
@@ -3861,19 +3861,19 @@ define i64 @test_rint_si64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:  .LBB21_4:
 ; RV32IZHINXMIN-NEXT:    lui a1, 389120
 ; RV32IZHINXMIN-NEXT:    addi a1, a1, -1
-; RV32IZHINXMIN-NEXT:    flt.s a3, a1, s0
-; RV32IZHINXMIN-NEXT:    beqz a3, .LBB21_6
+; RV32IZHINXMIN-NEXT:    flt.s a1, a1, s0
+; RV32IZHINXMIN-NEXT:    beqz a1, .LBB21_6
 ; RV32IZHINXMIN-NEXT:  # %bb.5:
-; RV32IZHINXMIN-NEXT:    addi a2, a4, -1
+; RV32IZHINXMIN-NEXT:    addi a2, a3, -1
 ; RV32IZHINXMIN-NEXT:  .LBB21_6:
-; RV32IZHINXMIN-NEXT:    feq.s a1, s0, s0
-; RV32IZHINXMIN-NEXT:    neg a4, a1
-; RV32IZHINXMIN-NEXT:    and a1, a4, a2
-; RV32IZHINXMIN-NEXT:    neg a2, s1
-; RV32IZHINXMIN-NEXT:    and a0, a2, a0
-; RV32IZHINXMIN-NEXT:    neg a2, a3
-; RV32IZHINXMIN-NEXT:    or a0, a2, a0
+; RV32IZHINXMIN-NEXT:    feq.s a3, s0, s0
+; RV32IZHINXMIN-NEXT:    neg a4, s1
+; RV32IZHINXMIN-NEXT:    neg a5, a1
+; RV32IZHINXMIN-NEXT:    neg a3, a3
 ; RV32IZHINXMIN-NEXT:    and a0, a4, a0
+; RV32IZHINXMIN-NEXT:    and a1, a3, a2
+; RV32IZHINXMIN-NEXT:    or a0, a5, a0
+; RV32IZHINXMIN-NEXT:    and a0, a3, a0
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
@@ -4069,10 +4069,10 @@ define i64 @test_rint_ui64(half %x) nounwind {
 ; RV32IZFH-NEXT:    lui a2, %hi(.LCPI23_1)
 ; RV32IZFH-NEXT:    flw fa5, %lo(.LCPI23_1)(a2)
 ; RV32IZFH-NEXT:    and a0, s0, a0
+; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFH-NEXT:    neg a2, a2
 ; RV32IZFH-NEXT:    or a0, a2, a0
-; RV32IZFH-NEXT:    and a1, s0, a1
 ; RV32IZFH-NEXT:    or a1, a2, a1
 ; RV32IZFH-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFH-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -4112,11 +4112,11 @@ define i64 @test_rint_ui64(half %x) nounwind {
 ; RV32IZHINX-NEXT:    call __fixunssfdi
 ; RV32IZHINX-NEXT:    and a0, s1, a0
 ; RV32IZHINX-NEXT:    lui a2, 391168
+; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    addi a2, a2, -1
 ; RV32IZHINX-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINX-NEXT:    neg a2, a2
 ; RV32IZHINX-NEXT:    or a0, a2, a0
-; RV32IZHINX-NEXT:    and a1, s1, a1
 ; RV32IZHINX-NEXT:    or a1, a2, a1
 ; RV32IZHINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINX-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -4170,10 +4170,10 @@ define i64 @test_rint_ui64(half %x) nounwind {
 ; RV32IZFHMIN-NEXT:    lui a2, %hi(.LCPI23_0)
 ; RV32IZFHMIN-NEXT:    flw fa5, %lo(.LCPI23_0)(a2)
 ; RV32IZFHMIN-NEXT:    and a0, s0, a0
+; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    flt.s a2, fa5, fs0
 ; RV32IZFHMIN-NEXT:    neg a2, a2
 ; RV32IZFHMIN-NEXT:    or a0, a2, a0
-; RV32IZFHMIN-NEXT:    and a1, s0, a1
 ; RV32IZFHMIN-NEXT:    or a1, a2, a1
 ; RV32IZFHMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZFHMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
@@ -4227,11 +4227,11 @@ define i64 @test_rint_ui64(half %x) nounwind {
 ; RV32IZHINXMIN-NEXT:    call __fixunssfdi
 ; RV32IZHINXMIN-NEXT:    and a0, s1, a0
 ; RV32IZHINXMIN-NEXT:    lui a2, 391168
+; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    addi a2, a2, -1
 ; RV32IZHINXMIN-NEXT:    flt.s a2, a2, s0
 ; RV32IZHINXMIN-NEXT:    neg a2, a2
 ; RV32IZHINXMIN-NEXT:    or a0, a2, a0
-; RV32IZHINXMIN-NEXT:    and a1, s1, a1
 ; RV32IZHINXMIN-NEXT:    or a1, a2, a1
 ; RV32IZHINXMIN-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32IZHINXMIN-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll
index b793c500fc397b..d92dcb9eac4c61 100644
--- a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll
+++ b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll
@@ -358,8 +358,8 @@ define half @select_fcmp_ord(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: select_fcmp_ord:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a2, a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a3, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a2, a2, a2
 ; CHECKIZHINXMIN-NEXT:    feq.s a3, a3, a3
 ; CHECKIZHINXMIN-NEXT:    and a2, a3, a2
 ; CHECKIZHINXMIN-NEXT:    bnez a2, .LBB7_2
@@ -689,8 +689,8 @@ define half @select_fcmp_uno(half %a, half %b) nounwind {
 ; CHECKIZHINXMIN-LABEL: select_fcmp_uno:
 ; CHECKIZHINXMIN:       # %bb.0:
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a2, a1
-; CHECKIZHINXMIN-NEXT:    feq.s a2, a2, a2
 ; CHECKIZHINXMIN-NEXT:    fcvt.s.h a3, a0
+; CHECKIZHINXMIN-NEXT:    feq.s a2, a2, a2
 ; CHECKIZHINXMIN-NEXT:    feq.s a3, a3, a3
 ; CHECKIZHINXMIN-NEXT:    and a2, a3, a2
 ; CHECKIZHINXMIN-NEXT:    beqz a2, .LBB14_2

diff  --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll
index a0c85ab4dca7f7..66cde323ce507d 100644
--- a/llvm/test/CodeGen/RISCV/iabs.ll
+++ b/llvm/test/CodeGen/RISCV/iabs.ll
@@ -301,58 +301,58 @@ define i64 @select_abs64(i64 %x) {
 define i128 @abs128(i128 %x) {
 ; RV32I-LABEL: abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a3, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a2, .LBB8_2
+; RV32I-NEXT:    bgez a3, .LBB8_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    neg a5, a1
 ; RV32I-NEXT:    snez a6, a4
-; RV32I-NEXT:    snez a7, a3
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    sltu t0, a5, a6
+; RV32I-NEXT:    snez a7, a2
 ; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, t0
-; RV32I-NEXT:    sub a1, a5, a6
 ; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    sub a4, a4, a7
-; RV32I-NEXT:    neg a3, a3
+; RV32I-NEXT:    sltu a3, a5, a6
+; RV32I-NEXT:    neg a7, a1
+; RV32I-NEXT:    sub a1, a5, a6
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:  .LBB8_2:
-; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a3, 0(a1)
+; RV32ZBB-NEXT:    lw a3, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a2, .LBB8_2
+; RV32ZBB-NEXT:    bgez a3, .LBB8_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    neg a5, a1
 ; RV32ZBB-NEXT:    snez a6, a4
-; RV32ZBB-NEXT:    snez a7, a3
-; RV32ZBB-NEXT:    or a6, a7, a6
-; RV32ZBB-NEXT:    sltu t0, a5, a6
+; RV32ZBB-NEXT:    snez a7, a2
 ; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, t0
-; RV32ZBB-NEXT:    sub a1, a5, a6
 ; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    or a6, a7, a6
+; RV32ZBB-NEXT:    add a1, a3, a1
 ; RV32ZBB-NEXT:    sub a4, a4, a7
-; RV32ZBB-NEXT:    neg a3, a3
+; RV32ZBB-NEXT:    sltu a3, a5, a6
+; RV32ZBB-NEXT:    neg a7, a1
+; RV32ZBB-NEXT:    sub a1, a5, a6
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:  .LBB8_2:
-; RV32ZBB-NEXT:    sw a3, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a4, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: abs128:
@@ -383,58 +383,58 @@ define i128 @abs128(i128 %x) {
 define i128 @select_abs128(i128 %x) {
 ; RV32I-LABEL: select_abs128:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 12(a1)
-; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a3, 12(a1)
+; RV32I-NEXT:    lw a2, 0(a1)
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a1, 8(a1)
-; RV32I-NEXT:    bgez a2, .LBB9_2
+; RV32I-NEXT:    bgez a3, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    neg a5, a1
 ; RV32I-NEXT:    snez a6, a4
-; RV32I-NEXT:    snez a7, a3
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    sltu t0, a5, a6
+; RV32I-NEXT:    snez a7, a2
 ; RV32I-NEXT:    snez a1, a1
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a2, a1, t0
-; RV32I-NEXT:    sub a1, a5, a6
 ; RV32I-NEXT:    neg a4, a4
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    add a1, a3, a1
 ; RV32I-NEXT:    sub a4, a4, a7
-; RV32I-NEXT:    neg a3, a3
+; RV32I-NEXT:    sltu a3, a5, a6
+; RV32I-NEXT:    neg a7, a1
+; RV32I-NEXT:    sub a1, a5, a6
+; RV32I-NEXT:    sub a3, a7, a3
+; RV32I-NEXT:    neg a2, a2
 ; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    sw a2, 0(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
 ; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a2, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: select_abs128:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    lw a2, 12(a1)
-; RV32ZBB-NEXT:    lw a3, 0(a1)
+; RV32ZBB-NEXT:    lw a3, 12(a1)
+; RV32ZBB-NEXT:    lw a2, 0(a1)
 ; RV32ZBB-NEXT:    lw a4, 4(a1)
 ; RV32ZBB-NEXT:    lw a1, 8(a1)
-; RV32ZBB-NEXT:    bgez a2, .LBB9_2
+; RV32ZBB-NEXT:    bgez a3, .LBB9_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    neg a5, a1
 ; RV32ZBB-NEXT:    snez a6, a4
-; RV32ZBB-NEXT:    snez a7, a3
-; RV32ZBB-NEXT:    or a6, a7, a6
-; RV32ZBB-NEXT:    sltu t0, a5, a6
+; RV32ZBB-NEXT:    snez a7, a2
 ; RV32ZBB-NEXT:    snez a1, a1
-; RV32ZBB-NEXT:    add a1, a2, a1
-; RV32ZBB-NEXT:    neg a1, a1
-; RV32ZBB-NEXT:    sub a2, a1, t0
-; RV32ZBB-NEXT:    sub a1, a5, a6
 ; RV32ZBB-NEXT:    neg a4, a4
+; RV32ZBB-NEXT:    or a6, a7, a6
+; RV32ZBB-NEXT:    add a1, a3, a1
 ; RV32ZBB-NEXT:    sub a4, a4, a7
-; RV32ZBB-NEXT:    neg a3, a3
+; RV32ZBB-NEXT:    sltu a3, a5, a6
+; RV32ZBB-NEXT:    neg a7, a1
+; RV32ZBB-NEXT:    sub a1, a5, a6
+; RV32ZBB-NEXT:    sub a3, a7, a3
+; RV32ZBB-NEXT:    neg a2, a2
 ; RV32ZBB-NEXT:  .LBB9_2:
-; RV32ZBB-NEXT:    sw a3, 0(a0)
+; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw a4, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
-; RV32ZBB-NEXT:    sw a2, 12(a0)
+; RV32ZBB-NEXT:    sw a3, 12(a0)
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64I-LABEL: select_abs128:

diff  --git a/llvm/test/CodeGen/RISCV/imm.ll b/llvm/test/CodeGen/RISCV/imm.ll
index 70bcb066fe4f0a..830f381b659d18 100644
--- a/llvm/test/CodeGen/RISCV/imm.ll
+++ b/llvm/test/CodeGen/RISCV/imm.ll
@@ -888,8 +888,8 @@ define i64 @imm64_8() nounwind {
 ; RV32I-LABEL: imm64_8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 633806
-; RV32I-NEXT:    addi a0, a0, -272
 ; RV32I-NEXT:    lui a1, 74565
+; RV32I-NEXT:    addi a0, a0, -272
 ; RV32I-NEXT:    addi a1, a1, 1656
 ; RV32I-NEXT:    ret
 ;
@@ -1190,8 +1190,8 @@ define i64 @imm_right_shifted_lui_1() nounwind {
 ; RV32I-LABEL: imm_right_shifted_lui_1:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 1048575
-; RV32I-NEXT:    addi a0, a0, 1
 ; RV32I-NEXT:    lui a1, 16
+; RV32I-NEXT:    addi a0, a0, 1
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret
 ;
@@ -1427,8 +1427,8 @@ define i64 @imm_end_2addi_1() nounwind {
 ; RV32I-LABEL: imm_end_2addi_1:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 1048575
-; RV32I-NEXT:    addi a0, a0, 2047
 ; RV32I-NEXT:    lui a1, 1048512
+; RV32I-NEXT:    addi a0, a0, 2047
 ; RV32I-NEXT:    addi a1, a1, 127
 ; RV32I-NEXT:    ret
 ;
@@ -2446,8 +2446,8 @@ define i64 @imm_neg_8798043653189() {
 ; RV32I-LABEL: imm_neg_8798043653189:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 572348
-; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    lui a1, 1048575
+; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    addi a1, a1, 2047
 ; RV32I-NEXT:    ret
 ;
@@ -2512,8 +2512,8 @@ define i64 @imm_9223372034904144827() {
 ; RV32I-LABEL: imm_9223372034904144827:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 572348
-; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    lui a1, 524288
+; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret
 ;
@@ -2578,8 +2578,8 @@ define i64 @imm_neg_9223354442718100411() {
 ; RV32I-LABEL: imm_neg_9223354442718100411:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 572348
-; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    lui a1, 524287
+; RV32I-NEXT:    addi a0, a0, -1093
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret
 ;
@@ -2895,8 +2895,8 @@ define i64 @imm_12900924131259() {
 ; RV32I-LABEL: imm_12900924131259:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 765952
-; RV32I-NEXT:    addi a0, a0, 1979
 ; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a0, a0, 1979
 ; RV32I-NEXT:    addi a1, a1, -1093
 ; RV32I-NEXT:    ret
 ;
@@ -3017,8 +3017,8 @@ define i64 @imm_12900936431479() {
 ; RV32I-LABEL: imm_12900936431479:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 768955
-; RV32I-NEXT:    addi a0, a0, 1911
 ; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a0, a0, 1911
 ; RV32I-NEXT:    addi a1, a1, -1093
 ; RV32I-NEXT:    ret
 ;
@@ -3089,8 +3089,8 @@ define i64 @imm_12900918536874() {
 ; RV32I-LABEL: imm_12900918536874:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 764587
-; RV32I-NEXT:    addi a0, a0, -1366
 ; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a0, a0, -1366
 ; RV32I-NEXT:    addi a1, a1, -1093
 ; RV32I-NEXT:    ret
 ;
@@ -3161,8 +3161,8 @@ define i64 @imm_12900925247761() {
 ; RV32I-LABEL: imm_12900925247761:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 766225
-; RV32I-NEXT:    addi a0, a0, 273
 ; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    addi a0, a0, 273
 ; RV32I-NEXT:    addi a1, a1, -1093
 ; RV32I-NEXT:    ret
 ;
@@ -4165,8 +4165,8 @@ define i64 @imm64_0xFF7FFFFF7FFFFFFE() {
 ; RV32I-LABEL: imm64_0xFF7FFFFF7FFFFFFE:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, 524288
-; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    lui a1, 1046528
+; RV32I-NEXT:    addi a0, a0, -1
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
index b1afdded62d69a..d58e6fe7675da6 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
@@ -12,8 +12,8 @@ define double @constraint_f_double(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    fadd.d fa5, fa5, fa4
@@ -45,8 +45,8 @@ define double @constraint_cf_double(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    fadd.d fa5, fa5, fa4
@@ -78,8 +78,8 @@ define double @constraint_f_double_abi_name(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa1, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa1, 8(sp)
 ; RV32F-NEXT:    fld fs0, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    fadd.d ft0, fa1, fs0

diff  --git a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
index 581cf8e3bf3c9e..238a0fa0b6fd72 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-d-modifier-N.ll
@@ -15,8 +15,8 @@ define double @constraint_f_double(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20)
@@ -48,8 +48,8 @@ define double @constraint_cf_double(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa5, 8(sp)
 ; RV32F-NEXT:    fld fa4, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    .insn 0x4, 0x02000053 | (15 << 7) | (15 << 15) | (14 << 20)
@@ -81,8 +81,8 @@ define double @constraint_f_double_abi_name(double %a) nounwind {
 ; RV32F-NEXT:    addi sp, sp, -16
 ; RV32F-NEXT:    sw a0, 8(sp)
 ; RV32F-NEXT:    sw a1, 12(sp)
-; RV32F-NEXT:    fld fa1, 8(sp)
 ; RV32F-NEXT:    lui a0, %hi(gd)
+; RV32F-NEXT:    fld fa1, 8(sp)
 ; RV32F-NEXT:    fld fs0, %lo(gd)(a0)
 ; RV32F-NEXT:    #APP
 ; RV32F-NEXT:    .insn 0x4, 0x02000053 | (0 << 7) | (11 << 15) | (8 << 20)

diff  --git a/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll b/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll
index fa6ac96b57b1eb..973eb9f41f4fe3 100644
--- a/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll
+++ b/llvm/test/CodeGen/RISCV/interrupt-attr-nocall.ll
@@ -29,8 +29,8 @@ define void @foo_i32() nounwind #0 {
 ; CHECK-RV32-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    lui a0, %hi(a)
-; CHECK-RV32-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32-NEXT:    lui a1, %hi(b)
+; CHECK-RV32-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32-NEXT:    add a0, a1, a0
 ; CHECK-RV32-NEXT:    lui a1, %hi(c)
@@ -46,8 +46,8 @@ define void @foo_i32() nounwind #0 {
 ; CHECK-RV32IF-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    lui a0, %hi(a)
-; CHECK-RV32IF-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(b)
+; CHECK-RV32IF-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IF-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32IF-NEXT:    add a0, a1, a0
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(c)
@@ -63,8 +63,8 @@ define void @foo_i32() nounwind #0 {
 ; CHECK-RV32IFD-NEXT:    sw a0, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32IFD-NEXT:    sw a1, 8(sp) # 4-byte Folded Spill
 ; CHECK-RV32IFD-NEXT:    lui a0, %hi(a)
-; CHECK-RV32IFD-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IFD-NEXT:    lui a1, %hi(b)
+; CHECK-RV32IFD-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IFD-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32IFD-NEXT:    add a0, a1, a0
 ; CHECK-RV32IFD-NEXT:    lui a1, %hi(c)
@@ -94,8 +94,8 @@ define void @foo_fp_i32() nounwind #1 {
 ; CHECK-RV32-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    addi s0, sp, 16
 ; CHECK-RV32-NEXT:    lui a0, %hi(a)
-; CHECK-RV32-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32-NEXT:    lui a1, %hi(b)
+; CHECK-RV32-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32-NEXT:    add a0, a1, a0
 ; CHECK-RV32-NEXT:    lui a1, %hi(c)
@@ -116,8 +116,8 @@ define void @foo_fp_i32() nounwind #1 {
 ; CHECK-RV32IF-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    addi s0, sp, 16
 ; CHECK-RV32IF-NEXT:    lui a0, %hi(a)
-; CHECK-RV32IF-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(b)
+; CHECK-RV32IF-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IF-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32IF-NEXT:    add a0, a1, a0
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(c)
@@ -138,8 +138,8 @@ define void @foo_fp_i32() nounwind #1 {
 ; CHECK-RV32IFD-NEXT:    sw a1, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32IFD-NEXT:    addi s0, sp, 16
 ; CHECK-RV32IFD-NEXT:    lui a0, %hi(a)
-; CHECK-RV32IFD-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IFD-NEXT:    lui a1, %hi(b)
+; CHECK-RV32IFD-NEXT:    lw a0, %lo(a)(a0)
 ; CHECK-RV32IFD-NEXT:    lw a1, %lo(b)(a1)
 ; CHECK-RV32IFD-NEXT:    add a0, a1, a0
 ; CHECK-RV32IFD-NEXT:    lui a1, %hi(c)
@@ -182,8 +182,8 @@ define void @foo_float() nounwind #0 {
 ; CHECK-RV32-NEXT:    sw t5, 4(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    sw t6, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    lui a0, %hi(e)
-; CHECK-RV32-NEXT:    lw a0, %lo(e)(a0)
 ; CHECK-RV32-NEXT:    lui a1, %hi(f)
+; CHECK-RV32-NEXT:    lw a0, %lo(e)(a0)
 ; CHECK-RV32-NEXT:    lw a1, %lo(f)(a1)
 ; CHECK-RV32-NEXT:    call __addsf3
 ; CHECK-RV32-NEXT:    lui a1, %hi(d)
@@ -277,8 +277,8 @@ define void @foo_fp_float() nounwind #1 {
 ; CHECK-RV32-NEXT:    sw t6, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    addi s0, sp, 80
 ; CHECK-RV32-NEXT:    lui a0, %hi(e)
-; CHECK-RV32-NEXT:    lw a0, %lo(e)(a0)
 ; CHECK-RV32-NEXT:    lui a1, %hi(f)
+; CHECK-RV32-NEXT:    lw a0, %lo(e)(a0)
 ; CHECK-RV32-NEXT:    lw a1, %lo(f)(a1)
 ; CHECK-RV32-NEXT:    call __addsf3
 ; CHECK-RV32-NEXT:    lui a1, %hi(d)
@@ -382,9 +382,9 @@ define void @foo_double() nounwind #0 {
 ; CHECK-RV32-NEXT:    sw t5, 4(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    sw t6, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    lui a1, %hi(h)
+; CHECK-RV32-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32-NEXT:    lw a0, %lo(h)(a1)
 ; CHECK-RV32-NEXT:    lw a1, %lo(h+4)(a1)
-; CHECK-RV32-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32-NEXT:    lw a2, %lo(i)(a3)
 ; CHECK-RV32-NEXT:    lw a3, %lo(i+4)(a3)
 ; CHECK-RV32-NEXT:    call __adddf3
@@ -450,9 +450,9 @@ define void @foo_double() nounwind #0 {
 ; CHECK-RV32IF-NEXT:    fsw ft10, 4(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    fsw ft11, 0(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(h)
+; CHECK-RV32IF-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32IF-NEXT:    lw a0, %lo(h)(a1)
 ; CHECK-RV32IF-NEXT:    lw a1, %lo(h+4)(a1)
-; CHECK-RV32IF-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32IF-NEXT:    lw a2, %lo(i)(a3)
 ; CHECK-RV32IF-NEXT:    lw a3, %lo(i+4)(a3)
 ; CHECK-RV32IF-NEXT:    call __adddf3
@@ -549,9 +549,9 @@ define void @foo_fp_double() nounwind #1 {
 ; CHECK-RV32-NEXT:    sw t6, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32-NEXT:    addi s0, sp, 80
 ; CHECK-RV32-NEXT:    lui a1, %hi(h)
+; CHECK-RV32-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32-NEXT:    lw a0, %lo(h)(a1)
 ; CHECK-RV32-NEXT:    lw a1, %lo(h+4)(a1)
-; CHECK-RV32-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32-NEXT:    lw a2, %lo(i)(a3)
 ; CHECK-RV32-NEXT:    lw a3, %lo(i+4)(a3)
 ; CHECK-RV32-NEXT:    call __adddf3
@@ -620,9 +620,9 @@ define void @foo_fp_double() nounwind #1 {
 ; CHECK-RV32IF-NEXT:    fsw ft11, 12(sp) # 4-byte Folded Spill
 ; CHECK-RV32IF-NEXT:    addi s0, sp, 160
 ; CHECK-RV32IF-NEXT:    lui a1, %hi(h)
+; CHECK-RV32IF-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32IF-NEXT:    lw a0, %lo(h)(a1)
 ; CHECK-RV32IF-NEXT:    lw a1, %lo(h+4)(a1)
-; CHECK-RV32IF-NEXT:    lui a3, %hi(i)
 ; CHECK-RV32IF-NEXT:    lw a2, %lo(i)(a3)
 ; CHECK-RV32IF-NEXT:    lw a3, %lo(i+4)(a3)
 ; CHECK-RV32IF-NEXT:    call __adddf3

diff  --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index 19f40dddeaec25..111b3e2bf82ce3 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -8,17 +8,17 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ; RV32-LABEL: ctz_nxv4i32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vid.v v11
+; RV32-NEXT:    vid.v v10
 ; RV32-NEXT:    li a1, -1
-; RV32-NEXT:    vmadd.vx v11, a1, v10
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v8, 0
+; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vmadd.vx v10, a1, v8
 ; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmerge.vvm v8, v8, v11, v0
+; RV32-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    sub a0, a0, a1
@@ -29,17 +29,17 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ; RV64-LABEL: ctz_nxv4i32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    srli a0, a0, 1
 ; RV64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a0
-; RV64-NEXT:    vid.v v11
+; RV64-NEXT:    vid.v v10
 ; RV64-NEXT:    li a1, -1
-; RV64-NEXT:    vmadd.vx v11, a1, v10
 ; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
+; RV64-NEXT:    srli a0, a0, 1
 ; RV64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vmadd.vx v10, a1, v8
 ; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vvm v8, v8, v11, v0
+; RV64-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    subw a0, a0, a1
@@ -75,28 +75,28 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    addi a2, sp, 16
 ; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
-; RV32-NEXT:    vid.v v16
+; RV32-NEXT:    vlse64.v v16, (a2), zero
+; RV32-NEXT:    vid.v v8
 ; RV32-NEXT:    li a2, -1
-; RV32-NEXT:    vmadd.vx v16, a2, v8
-; RV32-NEXT:    addi a2, sp, 32
-; RV32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vl2r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vmsne.vi v0, v8, 0
+; RV32-NEXT:    vmsne.vi v0, v24, 0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vmadd.vx v8, a2, v16
+; RV32-NEXT:    vmv.v.i v16, 0
+; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    vmerge.vim v16, v16, -1, v0
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    li a4, 32
+; RV32-NEXT:    vmv.x.s a3, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a4
+; RV32-NEXT:    vsrl.vx v8, v8, a2
+; RV32-NEXT:    sltu a2, a0, a3
 ; RV32-NEXT:    vmv.x.s a4, v8
 ; RV32-NEXT:    sub a1, a1, a4
-; RV32-NEXT:    sub a1, a1, a3
-; RV32-NEXT:    sub a0, a0, a2
+; RV32-NEXT:    sub a1, a1, a2
+; RV32-NEXT:    sub a0, a0, a3
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 1
 ; RV32-NEXT:    add sp, sp, a2
@@ -111,15 +111,15 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vmv.v.x v16, a0
-; RV64-NEXT:    vid.v v24
+; RV64-NEXT:    vid.v v16
 ; RV64-NEXT:    li a1, -1
-; RV64-NEXT:    vmadd.vx v24, a1, v16
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vmadd.vx v16, a1, v8
 ; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmerge.vvm v8, v8, v24, v0
+; RV64-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    sub a0, a0, a1

diff  --git a/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll b/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
index c7ba0e501fa44f..4a338ce5bd1f78 100644
--- a/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
+++ b/llvm/test/CodeGen/RISCV/lack-of-signed-truncation-check.ll
@@ -25,8 +25,8 @@ define i1 @shifts_necmp_i16_i8(i16 %x) nounwind {
 ; RV32I-LABEL: shifts_necmp_i16_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 16
-; RV32I-NEXT:    srli a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srli a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 8
 ; RV32I-NEXT:    srli a0, a0, 16
 ; RV32I-NEXT:    xor a0, a0, a1
@@ -36,8 +36,8 @@ define i1 @shifts_necmp_i16_i8(i16 %x) nounwind {
 ; RV64I-LABEL: shifts_necmp_i16_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a0, 48
-; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 8
 ; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    xor a0, a0, a1
@@ -638,10 +638,10 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a2, a0, 128
 ; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:    sltiu a2, a2, 256
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    snez a0, a0
-; RV32I-NEXT:    sltiu a1, a2, 256
-; RV32I-NEXT:    xori a1, a1, 1
+; RV32I-NEXT:    xori a1, a2, 1
 ; RV32I-NEXT:    or a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
@@ -754,9 +754,9 @@ define i1 @add_ugecmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind {
 ; RV32I-LABEL: add_ugecmp_bad_i16_i8_cmp:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    addi a0, a0, 128
 ; RV32I-NEXT:    addi a2, a2, -1
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    addi a0, a0, 128
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    sltu a0, a0, a1
 ; RV32I-NEXT:    xori a0, a0, 1
@@ -765,9 +765,9 @@ define i1 @add_ugecmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind {
 ; RV64I-LABEL: add_ugecmp_bad_i16_i8_cmp:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    addi a0, a0, 128
 ; RV64I-NEXT:    addiw a2, a2, -1
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    addi a0, a0, 128
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    sltu a0, a0, a1
 ; RV64I-NEXT:    xori a0, a0, 1

diff  --git a/llvm/test/CodeGen/RISCV/llvm.exp10.ll b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
index a1f8bd4ab12bea..15a123400fd4fa 100644
--- a/llvm/test/CodeGen/RISCV/llvm.exp10.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.exp10.ll
@@ -187,12 +187,12 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; RV32IFD-NEXT:    .cfi_offset fs0, -24
 ; RV32IFD-NEXT:    .cfi_offset fs1, -32
 ; RV32IFD-NEXT:    .cfi_offset fs2, -40
-; RV32IFD-NEXT:    lhu a2, 8(a1)
-; RV32IFD-NEXT:    lhu a3, 0(a1)
-; RV32IFD-NEXT:    lhu a1, 4(a1)
 ; RV32IFD-NEXT:    mv s0, a0
-; RV32IFD-NEXT:    fmv.w.x fs0, a2
-; RV32IFD-NEXT:    fmv.w.x fs1, a3
+; RV32IFD-NEXT:    lhu a0, 8(a1)
+; RV32IFD-NEXT:    lhu a2, 0(a1)
+; RV32IFD-NEXT:    lhu a1, 4(a1)
+; RV32IFD-NEXT:    fmv.w.x fs0, a0
+; RV32IFD-NEXT:    fmv.w.x fs1, a2
 ; RV32IFD-NEXT:    fmv.w.x fa0, a1
 ; RV32IFD-NEXT:    call __extendhfsf2
 ; RV32IFD-NEXT:    call exp10f

diff  --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index e85a7118f5ff83..74dec76a02e892 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -61,8 +61,8 @@ define { half, i32 } @test_frexp_f16_i32(half %a) nounwind {
 ; RV32IZFINXZDINX-NEXT:    addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT:    call frexpf
 ; RV32IZFINXZDINX-NEXT:    call __truncsfhf2
-; RV32IZFINXZDINX-NEXT:    lw a1, 8(sp)
 ; RV32IZFINXZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
+; RV32IZFINXZDINX-NEXT:    lw a1, 8(sp)
 ; RV32IZFINXZDINX-NEXT:    lui a2, 1048560
 ; RV32IZFINXZDINX-NEXT:    or a0, a0, a2
 ; RV32IZFINXZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10
@@ -78,8 +78,8 @@ define { half, i32 } @test_frexp_f16_i32(half %a) nounwind {
 ; RV64IZFINXZDINX-NEXT:    mv a1, sp
 ; RV64IZFINXZDINX-NEXT:    call frexpf
 ; RV64IZFINXZDINX-NEXT:    call __truncsfhf2
-; RV64IZFINXZDINX-NEXT:    ld a1, 0(sp)
 ; RV64IZFINXZDINX-NEXT:    # kill: def $x10_w killed $x10_w def $x10
+; RV64IZFINXZDINX-NEXT:    ld a1, 0(sp)
 ; RV64IZFINXZDINX-NEXT:    lui a2, 1048560
 ; RV64IZFINXZDINX-NEXT:    or a0, a0, a2
 ; RV64IZFINXZDINX-NEXT:    # kill: def $x10_w killed $x10_w killed $x10

diff  --git a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
index 8693283e83712d..43719a452c2360 100644
--- a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
+++ b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
@@ -32,19 +32,19 @@ define void @test(i32 signext %row, i32 signext %N.in) nounwind {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    blez a1, .LBB0_3
 ; RV32-NEXT:  # %bb.1: # %cond_true.preheader
-; RV32-NEXT:    slli a0, a0, 6
-; RV32-NEXT:    lui a2, %hi(A)
-; RV32-NEXT:    addi a2, a2, %lo(A)
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    addi a0, a0, 8
-; RV32-NEXT:    li a2, 4
+; RV32-NEXT:    slli a2, a0, 6
+; RV32-NEXT:    lui a3, %hi(A)
+; RV32-NEXT:    addi a3, a3, %lo(A)
+; RV32-NEXT:    li a0, 4
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    addi a2, a2, 8
 ; RV32-NEXT:    li a3, 5
 ; RV32-NEXT:  .LBB0_2: # %cond_true
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32-NEXT:    sw a2, -4(a0)
-; RV32-NEXT:    sw a3, 0(a0)
+; RV32-NEXT:    sw a0, -4(a2)
+; RV32-NEXT:    sw a3, 0(a2)
 ; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    addi a0, a0, 4
+; RV32-NEXT:    addi a2, a2, 4
 ; RV32-NEXT:    bnez a1, .LBB0_2
 ; RV32-NEXT:  .LBB0_3: # %return
 ; RV32-NEXT:    ret
@@ -53,24 +53,24 @@ define void @test(i32 signext %row, i32 signext %N.in) nounwind {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    blez a1, .LBB0_3
 ; RV64-NEXT:  # %bb.1: # %cond_true.preheader
-; RV64-NEXT:    slli a0, a0, 6
-; RV64-NEXT:    lui a2, %hi(A)
-; RV64-NEXT:    addi a2, a2, %lo(A)
-; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    addi a2, a0, 4
+; RV64-NEXT:    slli a3, a0, 6
+; RV64-NEXT:    lui a4, %hi(A)
+; RV64-NEXT:    addi a4, a4, %lo(A)
 ; RV64-NEXT:    addiw a1, a1, 2
-; RV64-NEXT:    li a3, 2
-; RV64-NEXT:    li a4, 4
+; RV64-NEXT:    li a0, 2
+; RV64-NEXT:    li a2, 4
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    addi a4, a3, 4
 ; RV64-NEXT:    li a5, 5
 ; RV64-NEXT:  .LBB0_2: # %cond_true
 ; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64-NEXT:    sw a4, 0(a2)
-; RV64-NEXT:    slli a6, a3, 2
-; RV64-NEXT:    add a6, a0, a6
+; RV64-NEXT:    sw a2, 0(a4)
+; RV64-NEXT:    slli a6, a0, 2
+; RV64-NEXT:    addiw a0, a0, 1
+; RV64-NEXT:    add a6, a3, a6
 ; RV64-NEXT:    sw a5, 0(a6)
-; RV64-NEXT:    addiw a3, a3, 1
-; RV64-NEXT:    addi a2, a2, 4
-; RV64-NEXT:    bne a3, a1, .LBB0_2
+; RV64-NEXT:    addi a4, a4, 4
+; RV64-NEXT:    bne a0, a1, .LBB0_2
 ; RV64-NEXT:  .LBB0_3: # %return
 ; RV64-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll b/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll
index 2c8839683d8162..b2ccbd821eb8ea 100644
--- a/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll
+++ b/llvm/test/CodeGen/RISCV/lsr-legaladdimm.ll
@@ -20,10 +20,10 @@ define i32 @main() nounwind {
 ; RV32I-NEXT:  .LBB0_1: # %for.body
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32I-NEXT:    addi a4, a0, -2048
-; RV32I-NEXT:    sw a4, 0(a2)
 ; RV32I-NEXT:    sw a0, 0(a1)
 ; RV32I-NEXT:    addi a0, a0, 1
 ; RV32I-NEXT:    addi a1, a1, 4
+; RV32I-NEXT:    sw a4, 0(a2)
 ; RV32I-NEXT:    addi a2, a2, 4
 ; RV32I-NEXT:    bne a0, a3, .LBB0_1
 ; RV32I-NEXT:  # %bb.2: # %for.end

diff  --git a/llvm/test/CodeGen/RISCV/machine-combiner.ll b/llvm/test/CodeGen/RISCV/machine-combiner.ll
index ebf232cc458ba0..a18f5d6902dca7 100644
--- a/llvm/test/CodeGen/RISCV/machine-combiner.ll
+++ b/llvm/test/CodeGen/RISCV/machine-combiner.ll
@@ -10,9 +10,9 @@
 define double @test_reassoc_fadd1(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fadd1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fadd.d fa4, fa2, fa3
-; CHECK-NEXT:    fadd.d fa0, fa5, fa4
+; CHECK-NEXT:    fadd.d fa5, fa2, fa3
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fadd.d fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fadd nsz reassoc double %t0, %a2
@@ -23,9 +23,9 @@ define double @test_reassoc_fadd1(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fadd2(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fadd2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fadd.d fa4, fa2, fa3
-; CHECK-NEXT:    fadd.d fa0, fa4, fa5
+; CHECK-NEXT:    fadd.d fa5, fa2, fa3
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fadd.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fadd nsz reassoc double %a2, %t0
@@ -36,9 +36,9 @@ define double @test_reassoc_fadd2(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fadd3(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fadd3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fadd.d fa4, fa3, fa2
-; CHECK-NEXT:    fadd.d fa0, fa4, fa5
+; CHECK-NEXT:    fadd.d fa5, fa3, fa2
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fadd.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fadd nsz reassoc double %t0, %a2
@@ -49,9 +49,9 @@ define double @test_reassoc_fadd3(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fadd4(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fadd4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fadd.d fa4, fa3, fa2
-; CHECK-NEXT:    fadd.d fa0, fa4, fa5
+; CHECK-NEXT:    fadd.d fa5, fa3, fa2
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fadd.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fadd nsz reassoc double %a2, %t0
@@ -62,9 +62,9 @@ define double @test_reassoc_fadd4(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fmul1(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fmul1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmul.d fa5, fa0, fa1
-; CHECK-NEXT:    fmul.d fa4, fa2, fa3
-; CHECK-NEXT:    fmul.d fa0, fa5, fa4
+; CHECK-NEXT:    fmul.d fa5, fa2, fa3
+; CHECK-NEXT:    fmul.d fa4, fa0, fa1
+; CHECK-NEXT:    fmul.d fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = fmul nsz reassoc double %a0, %a1
   %t1 = fmul nsz reassoc double %t0, %a2
@@ -75,9 +75,9 @@ define double @test_reassoc_fmul1(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fmul2(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fmul2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmul.d fa5, fa0, fa1
-; CHECK-NEXT:    fmul.d fa4, fa2, fa3
-; CHECK-NEXT:    fmul.d fa0, fa4, fa5
+; CHECK-NEXT:    fmul.d fa5, fa2, fa3
+; CHECK-NEXT:    fmul.d fa4, fa0, fa1
+; CHECK-NEXT:    fmul.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fmul nsz reassoc double %a0, %a1
   %t1 = fmul nsz reassoc double %a2, %t0
@@ -88,9 +88,9 @@ define double @test_reassoc_fmul2(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fmul3(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fmul3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmul.d fa5, fa0, fa1
-; CHECK-NEXT:    fmul.d fa4, fa3, fa2
-; CHECK-NEXT:    fmul.d fa0, fa4, fa5
+; CHECK-NEXT:    fmul.d fa5, fa3, fa2
+; CHECK-NEXT:    fmul.d fa4, fa0, fa1
+; CHECK-NEXT:    fmul.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fmul nsz reassoc double %a0, %a1
   %t1 = fmul nsz reassoc double %t0, %a2
@@ -101,9 +101,9 @@ define double @test_reassoc_fmul3(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fmul4(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fmul4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmul.d fa5, fa0, fa1
-; CHECK-NEXT:    fmul.d fa4, fa3, fa2
-; CHECK-NEXT:    fmul.d fa0, fa4, fa5
+; CHECK-NEXT:    fmul.d fa5, fa3, fa2
+; CHECK-NEXT:    fmul.d fa4, fa0, fa1
+; CHECK-NEXT:    fmul.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fmul nsz reassoc double %a0, %a1
   %t1 = fmul nsz reassoc double %a2, %t0
@@ -114,11 +114,11 @@ define double @test_reassoc_fmul4(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_big1(double %a0, double %a1, double %a2, double %a3, double %a4, double %a5, double %a6) {
 ; CHECK-LABEL: test_reassoc_big1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa1, fa0, fa1
-; CHECK-NEXT:    fadd.d fa3, fa2, fa3
 ; CHECK-NEXT:    fadd.d fa5, fa4, fa5
-; CHECK-NEXT:    fadd.d fa4, fa1, fa3
+; CHECK-NEXT:    fadd.d fa4, fa2, fa3
+; CHECK-NEXT:    fadd.d fa3, fa0, fa1
 ; CHECK-NEXT:    fadd.d fa5, fa5, fa6
+; CHECK-NEXT:    fadd.d fa4, fa3, fa4
 ; CHECK-NEXT:    fadd.d fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
@@ -133,18 +133,18 @@ define double @test_reassoc_big1(double %a0, double %a1, double %a2, double %a3,
 define double @test_reassoc_big2(double %a0, double %a1, i32 %a2, double %a3, i32 %a4, double %a5) {
 ; CHECK-LABEL: test_reassoc_big2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
 ; CHECK-NEXT:    fsub.d fa4, fa3, fa2
-; CHECK-NEXT:    fadd.d fa3, fa2, fa1
-; CHECK-NEXT:    fcvt.d.w ft0, a0
-; CHECK-NEXT:    fcvt.d.w ft1, a1
-; CHECK-NEXT:    fmul.d fa2, fa2, ft0
-; CHECK-NEXT:    fmul.d fa1, ft1, fa1
-; CHECK-NEXT:    fsub.d fa5, fa4, fa5
-; CHECK-NEXT:    fmul.d fa4, fa0, fa3
-; CHECK-NEXT:    fmul.d fa3, fa1, fa2
-; CHECK-NEXT:    fmul.d fa5, fa5, fa4
-; CHECK-NEXT:    fmul.d fa0, fa5, fa3
+; CHECK-NEXT:    fadd.d fa3, fa0, fa1
+; CHECK-NEXT:    fadd.d ft0, fa2, fa1
+; CHECK-NEXT:    fcvt.d.w fa5, a1
+; CHECK-NEXT:    fcvt.d.w ft1, a0
+; CHECK-NEXT:    fmul.d fa5, fa5, fa1
+; CHECK-NEXT:    fmul.d fa2, fa2, ft1
+; CHECK-NEXT:    fsub.d fa4, fa4, fa3
+; CHECK-NEXT:    fmul.d fa3, fa0, ft0
+; CHECK-NEXT:    fmul.d fa5, fa5, fa2
+; CHECK-NEXT:    fmul.d fa4, fa4, fa3
+; CHECK-NEXT:    fmul.d fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %cvt1 = sitofp i32 %a2 to double
   %cvt2 = sitofp i32 %a4 to double
@@ -245,9 +245,9 @@ define double @test_fnmsub(double %a0, double %a1, double %a2) {
 define double @test_reassoc_fsub1(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fsub.d fa4, fa2, fa3
-; CHECK-NEXT:    fadd.d fa0, fa5, fa4
+; CHECK-NEXT:    fsub.d fa5, fa2, fa3
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fadd.d fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fadd nsz reassoc double %t0, %a2
@@ -258,9 +258,9 @@ define double @test_reassoc_fsub1(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fsub2(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fsub.d fa4, fa2, fa3
-; CHECK-NEXT:    fsub.d fa0, fa5, fa4
+; CHECK-NEXT:    fsub.d fa5, fa2, fa3
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fsub.d fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fsub nsz reassoc double %t0, %a2
@@ -271,9 +271,9 @@ define double @test_reassoc_fsub2(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fsub3(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fadd.d fa4, fa2, fa3
-; CHECK-NEXT:    fsub.d fa0, fa5, fa4
+; CHECK-NEXT:    fadd.d fa5, fa2, fa3
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fsub.d fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fsub nsz reassoc double %t0, %a2
@@ -284,9 +284,9 @@ define double @test_reassoc_fsub3(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fsub4(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fsub.d fa4, fa2, fa3
-; CHECK-NEXT:    fadd.d fa0, fa4, fa5
+; CHECK-NEXT:    fsub.d fa5, fa2, fa3
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fadd.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fadd nsz reassoc double %a2, %t0
@@ -297,9 +297,9 @@ define double @test_reassoc_fsub4(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fsub5(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fadd.d fa4, fa2, fa3
-; CHECK-NEXT:    fsub.d fa0, fa4, fa5
+; CHECK-NEXT:    fadd.d fa5, fa2, fa3
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fsub.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fsub nsz reassoc double %a2, %t0
@@ -310,9 +310,9 @@ define double @test_reassoc_fsub5(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fsub6(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub6:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fsub.d fa4, fa2, fa3
-; CHECK-NEXT:    fsub.d fa0, fa4, fa5
+; CHECK-NEXT:    fsub.d fa5, fa2, fa3
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fsub.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fsub nsz reassoc double %a2, %t0
@@ -323,9 +323,9 @@ define double @test_reassoc_fsub6(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fsub7(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fsub.d fa4, fa3, fa2
-; CHECK-NEXT:    fsub.d fa0, fa4, fa5
+; CHECK-NEXT:    fsub.d fa5, fa3, fa2
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fsub.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fadd nsz reassoc double %t0, %a2
@@ -336,9 +336,9 @@ define double @test_reassoc_fsub7(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fsub8(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fsub.d fa4, fa3, fa2
-; CHECK-NEXT:    fadd.d fa0, fa4, fa5
+; CHECK-NEXT:    fsub.d fa5, fa3, fa2
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fadd.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fsub nsz reassoc double %t0, %a2
@@ -349,9 +349,9 @@ define double @test_reassoc_fsub8(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fsub9(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub9:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fadd.d fa4, fa3, fa2
-; CHECK-NEXT:    fsub.d fa0, fa4, fa5
+; CHECK-NEXT:    fadd.d fa5, fa3, fa2
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fsub.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fsub nsz reassoc double %t0, %a2
@@ -362,9 +362,9 @@ define double @test_reassoc_fsub9(double %a0, double %a1, double %a2, double %a3
 define double @test_reassoc_fsub10(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub10:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fsub.d fa4, fa3, fa2
-; CHECK-NEXT:    fsub.d fa0, fa4, fa5
+; CHECK-NEXT:    fsub.d fa5, fa3, fa2
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fsub.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fadd nsz reassoc double %a2, %t0
@@ -375,9 +375,9 @@ define double @test_reassoc_fsub10(double %a0, double %a1, double %a2, double %a
 define double @test_reassoc_fsub11(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub11:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fadd.d fa4, fa3, fa2
-; CHECK-NEXT:    fsub.d fa0, fa4, fa5
+; CHECK-NEXT:    fadd.d fa5, fa3, fa2
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fsub.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fsub nsz reassoc double %a2, %t0
@@ -388,9 +388,9 @@ define double @test_reassoc_fsub11(double %a0, double %a1, double %a2, double %a
 define double @test_reassoc_fsub12(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_reassoc_fsub12:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fadd.d fa5, fa0, fa1
-; CHECK-NEXT:    fsub.d fa4, fa3, fa2
-; CHECK-NEXT:    fadd.d fa0, fa4, fa5
+; CHECK-NEXT:    fsub.d fa5, fa3, fa2
+; CHECK-NEXT:    fadd.d fa4, fa0, fa1
+; CHECK-NEXT:    fadd.d fa0, fa5, fa4
 ; CHECK-NEXT:    ret
   %t0 = fadd nsz reassoc double %a0, %a1
   %t1 = fsub nsz reassoc double %a2, %t0
@@ -687,9 +687,9 @@ define i64 @test_reassoc_xor_i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
 define i8 @test_reassoc_mul_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 ; CHECK-LABEL: test_reassoc_mul_i8:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    mul a1, a2, a3
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    ret
   %t0 = mul i8 %a0, %a1
   %t1 = mul i8 %t0, %a2
@@ -700,9 +700,9 @@ define i8 @test_reassoc_mul_i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3) {
 define i16 @test_reassoc_mul_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 ; CHECK-LABEL: test_reassoc_mul_i16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    mul a1, a2, a3
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    ret
   %t0 = mul i16 %a0, %a1
   %t1 = mul i16 %t0, %a2
@@ -713,9 +713,9 @@ define i16 @test_reassoc_mul_i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3) {
 define i32 @test_reassoc_mul_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 ; CHECK-LABEL: test_reassoc_mul_i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    mul a1, a2, a3
-; CHECK-NEXT:    mulw a0, a0, a1
+; CHECK-NEXT:    mulw a0, a0, a2
 ; CHECK-NEXT:    ret
   %t0 = mul i32 %a0, %a1
   %t1 = mul i32 %t0, %a2
@@ -726,9 +726,9 @@ define i32 @test_reassoc_mul_i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
 define i64 @test_reassoc_mul_i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
 ; CHECK-LABEL: test_reassoc_mul_i64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    mul a1, a2, a3
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    ret
   %t0 = mul i64 %a0, %a1
   %t1 = mul i64 %t0, %a2
@@ -995,9 +995,9 @@ define i64 @test_reassoc_max_i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
 define half @test_fmin_f16(half %a0, half %a1, half %a2, half %a3) {
 ; CHECK-LABEL: test_fmin_f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmin.h fa5, fa0, fa1
-; CHECK-NEXT:    fmin.h fa4, fa2, fa3
-; CHECK-NEXT:    fmin.h fa0, fa5, fa4
+; CHECK-NEXT:    fmin.h fa5, fa2, fa3
+; CHECK-NEXT:    fmin.h fa4, fa0, fa1
+; CHECK-NEXT:    fmin.h fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = call half @llvm.minnum.f16(half %a0, half %a1)
   %t1 = call half @llvm.minnum.f16(half %t0, half %a2)
@@ -1008,9 +1008,9 @@ define half @test_fmin_f16(half %a0, half %a1, half %a2, half %a3) {
 define float @test_fmin_f32(float %a0, float %a1, float %a2, float %a3) {
 ; CHECK-LABEL: test_fmin_f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmin.s fa5, fa0, fa1
-; CHECK-NEXT:    fmin.s fa4, fa2, fa3
-; CHECK-NEXT:    fmin.s fa0, fa5, fa4
+; CHECK-NEXT:    fmin.s fa5, fa2, fa3
+; CHECK-NEXT:    fmin.s fa4, fa0, fa1
+; CHECK-NEXT:    fmin.s fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = call float @llvm.minnum.f32(float %a0, float %a1)
   %t1 = call float @llvm.minnum.f32(float %t0, float %a2)
@@ -1021,9 +1021,9 @@ define float @test_fmin_f32(float %a0, float %a1, float %a2, float %a3) {
 define double @test_fmin_f64(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_fmin_f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmin.d fa5, fa0, fa1
-; CHECK-NEXT:    fmin.d fa4, fa2, fa3
-; CHECK-NEXT:    fmin.d fa0, fa5, fa4
+; CHECK-NEXT:    fmin.d fa5, fa2, fa3
+; CHECK-NEXT:    fmin.d fa4, fa0, fa1
+; CHECK-NEXT:    fmin.d fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = call double @llvm.minnum.f64(double %a0, double %a1)
   %t1 = call double @llvm.minnum.f64(double %t0, double %a2)
@@ -1034,9 +1034,9 @@ define double @test_fmin_f64(double %a0, double %a1, double %a2, double %a3) {
 define half @test_fmax_f16(half %a0, half %a1, half %a2, half %a3) {
 ; CHECK-LABEL: test_fmax_f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmax.h fa5, fa0, fa1
-; CHECK-NEXT:    fmax.h fa4, fa2, fa3
-; CHECK-NEXT:    fmax.h fa0, fa5, fa4
+; CHECK-NEXT:    fmax.h fa5, fa2, fa3
+; CHECK-NEXT:    fmax.h fa4, fa0, fa1
+; CHECK-NEXT:    fmax.h fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = call half @llvm.maxnum.f16(half %a0, half %a1)
   %t1 = call half @llvm.maxnum.f16(half %t0, half %a2)
@@ -1047,9 +1047,9 @@ define half @test_fmax_f16(half %a0, half %a1, half %a2, half %a3) {
 define float @test_fmax_f32(float %a0, float %a1, float %a2, float %a3) {
 ; CHECK-LABEL: test_fmax_f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmax.s fa5, fa0, fa1
-; CHECK-NEXT:    fmax.s fa4, fa2, fa3
-; CHECK-NEXT:    fmax.s fa0, fa5, fa4
+; CHECK-NEXT:    fmax.s fa5, fa2, fa3
+; CHECK-NEXT:    fmax.s fa4, fa0, fa1
+; CHECK-NEXT:    fmax.s fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = call float @llvm.maxnum.f32(float %a0, float %a1)
   %t1 = call float @llvm.maxnum.f32(float %t0, float %a2)
@@ -1060,9 +1060,9 @@ define float @test_fmax_f32(float %a0, float %a1, float %a2, float %a3) {
 define double @test_fmax_f64(double %a0, double %a1, double %a2, double %a3) {
 ; CHECK-LABEL: test_fmax_f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    fmax.d fa5, fa0, fa1
-; CHECK-NEXT:    fmax.d fa4, fa2, fa3
-; CHECK-NEXT:    fmax.d fa0, fa5, fa4
+; CHECK-NEXT:    fmax.d fa5, fa2, fa3
+; CHECK-NEXT:    fmax.d fa4, fa0, fa1
+; CHECK-NEXT:    fmax.d fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %t0 = call double @llvm.maxnum.f64(double %a0, double %a1)
   %t1 = call double @llvm.maxnum.f64(double %t0, double %a2)

diff  --git a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll
index 27297c97871832..af8105644b57db 100644
--- a/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/RISCV/machinelicm-address-pseudos.ll
@@ -97,10 +97,10 @@ ret:
 define void @test_la_tls_ie(i32 signext %n) {
 ; RV32I-LABEL: test_la_tls_ie:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:  .Lpcrel_hi2:
-; RV32I-NEXT:    auipc a1, %tls_ie_pcrel_hi(ie)
-; RV32I-NEXT:    lw a2, %pcrel_lo(.Lpcrel_hi2)(a1)
 ; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:  .Lpcrel_hi2:
+; RV32I-NEXT:    auipc a2, %tls_ie_pcrel_hi(ie)
+; RV32I-NEXT:    lw a2, %pcrel_lo(.Lpcrel_hi2)(a2)
 ; RV32I-NEXT:    add a2, a2, tp
 ; RV32I-NEXT:  .LBB2_1: # %loop
 ; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -112,10 +112,10 @@ define void @test_la_tls_ie(i32 signext %n) {
 ;
 ; RV64I-LABEL: test_la_tls_ie:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:  .Lpcrel_hi2:
-; RV64I-NEXT:    auipc a1, %tls_ie_pcrel_hi(ie)
-; RV64I-NEXT:    ld a2, %pcrel_lo(.Lpcrel_hi2)(a1)
 ; RV64I-NEXT:    li a1, 0
+; RV64I-NEXT:  .Lpcrel_hi2:
+; RV64I-NEXT:    auipc a2, %tls_ie_pcrel_hi(ie)
+; RV64I-NEXT:    ld a2, %pcrel_lo(.Lpcrel_hi2)(a2)
 ; RV64I-NEXT:    add a2, a2, tp
 ; RV64I-NEXT:  .LBB2_1: # %loop
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1

diff  --git a/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
index e30bdfb939471f..83e9bf661ab1ca 100644
--- a/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
+++ b/llvm/test/CodeGen/RISCV/machinelicm-constant-phys-reg.ll
@@ -13,10 +13,10 @@ define i32 @test(ptr %a, i64 %n)  {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v9, (a0)
 ; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vredsum.vs v9, v9, v8
 ; CHECK-NEXT:    vmv.x.s a3, v9
 ; CHECK-NEXT:    addw a3, a3, a3
-; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, 8
 ; CHECK-NEXT:    bnez a1, .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %exit

diff  --git a/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll b/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll
index b45365e7a8b631..c2882fd46c17da 100644
--- a/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll
+++ b/llvm/test/CodeGen/RISCV/macro-fusion-lui-addi.ll
@@ -18,16 +18,16 @@ define void @foo(i32 signext %0, i32 signext %1) {
 ;
 ; FUSION-LABEL: foo:
 ; FUSION:       # %bb.0:
-; FUSION-NEXT:    fcvt.s.w fa0, a1
 ; FUSION-NEXT:    lui a0, %hi(.L.str)
 ; FUSION-NEXT:    addi a0, a0, %lo(.L.str)
+; FUSION-NEXT:    fcvt.s.w fa0, a1
 ; FUSION-NEXT:    tail bar
 ;
 ; FUSION-POSTRA-LABEL: foo:
 ; FUSION-POSTRA:       # %bb.0:
-; FUSION-POSTRA-NEXT:    fcvt.s.w fa0, a1
 ; FUSION-POSTRA-NEXT:    lui a0, %hi(.L.str)
 ; FUSION-POSTRA-NEXT:    addi a0, a0, %lo(.L.str)
+; FUSION-POSTRA-NEXT:    fcvt.s.w fa0, a1
 ; FUSION-POSTRA-NEXT:    tail bar
   %3 = sitofp i32 %1 to float
   tail call void @bar(ptr @.str, float %3)

diff  --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index e852579c724f8e..d529ae6ecd0aba 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -2283,8 +2283,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a1, a1, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
@@ -2294,8 +2294,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -2305,8 +2305,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a1, a1, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
@@ -2316,8 +2316,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
@@ -2452,8 +2452,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -2472,8 +2472,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -2492,8 +2492,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -2512,8 +2512,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -2668,8 +2668,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
@@ -2692,8 +2692,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
@@ -2848,8 +2848,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB26_2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -2886,8 +2886,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB26_2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -3037,8 +3037,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
@@ -3055,16 +3055,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
@@ -3087,8 +3087,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
@@ -3105,16 +3105,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
@@ -3278,16 +3278,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
@@ -3326,16 +3326,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
@@ -4449,25 +4449,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-NEXT:    ret
@@ -4475,25 +4475,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-NEXT:    ret
@@ -4501,25 +4501,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
@@ -4527,25 +4527,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -4556,16 +4556,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a1, 3(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 0(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 1(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 2(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 1(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 2(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 0(a0)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a1, a4, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a2, a3
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a6, a7
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a0, a5
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a7, a0
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a5, a6
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
@@ -4576,20 +4576,20 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 0(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a2, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a5, a5, a6
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 24
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a3, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
@@ -4597,25 +4597,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
@@ -4623,25 +4623,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
@@ -4784,8 +4784,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -4804,8 +4804,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
@@ -4962,8 +4962,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -4982,8 +4982,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index f0c14ccb0d5f23..860c3a94abc0a7 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -1410,20 +1410,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-NEXT:    or a4, a4, a5
@@ -1466,20 +1466,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a4, a4, a5
@@ -1522,20 +1522,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a4, a4, a5
@@ -1578,20 +1578,20 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a5
@@ -1719,20 +1719,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-NEXT:    or a4, a4, a5
@@ -1775,20 +1775,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a4, a4, a5
@@ -1831,20 +1831,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a4, a4, a5
@@ -1887,20 +1887,20 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 28(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 24(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a5
@@ -1998,20 +1998,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-NEXT:    or a4, a4, a5
@@ -2034,20 +2034,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a4, a4, a5
@@ -2070,20 +2070,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a4, a4, a5
@@ -2106,20 +2106,20 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a5
@@ -2197,20 +2197,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-NEXT:    or a4, a4, a5
@@ -2233,20 +2233,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a4, a4, a5
@@ -2269,20 +2269,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a4, a4, a5
@@ -2305,20 +2305,20 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 56(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 48(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 32(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 40(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 48(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, t2
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t2, t0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t3, t1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a5
@@ -2979,8 +2979,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a1, a1, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
@@ -2990,8 +2990,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -3001,8 +3001,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a0, a0, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a1, a1, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
@@ -3012,8 +3012,8 @@ define i32 @memcmp_size_2(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sub a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
@@ -3148,8 +3148,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -3168,8 +3168,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -3188,8 +3188,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -3208,8 +3208,8 @@ define i32 @memcmp_size_3(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB24_2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -3364,8 +3364,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a2, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
@@ -3388,8 +3388,8 @@ define i32 @memcmp_size_4(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a2, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
@@ -3544,8 +3544,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB26_2
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
@@ -3582,8 +3582,8 @@ define i32 @memcmp_size_5(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB26_2
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
@@ -3733,8 +3733,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV32-ZBB-NEXT:  # %bb.2:
@@ -3751,16 +3751,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
@@ -3783,8 +3783,8 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a2, a2, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    srli a3, a3, 16
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV32-ZBKB-NEXT:  # %bb.2:
@@ -3801,16 +3801,16 @@ define i32 @memcmp_size_6(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a0, 4(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lh a1, 4(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 48
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB27_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
@@ -3974,16 +3974,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:  # %bb.2:
@@ -4022,16 +4022,16 @@ define i32 @memcmp_size_7(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a3, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a2
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.1: # %loadbb1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 3(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 3(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a2, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a3, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a2, a2, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a3, a3, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    bne a2, a3, .LBB28_3
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:  # %bb.2:
@@ -5809,25 +5809,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-NEXT:    ret
@@ -5835,25 +5835,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-NEXT:    ret
@@ -5861,25 +5861,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
@@ -5887,25 +5887,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
@@ -5916,16 +5916,16 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a1, 3(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 0(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 1(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 2(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 1(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 2(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 3(a0)
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 0(a0)
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a1, a4, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a2, a3
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a6, a7
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a0, a5
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a7, a0
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a5, a6
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a2, a0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a0, a3
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
@@ -5936,20 +5936,20 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 1(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 2(a1)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 0(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 1(a0)
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a2, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a5, a5, a6
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 24
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a3, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a2
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
@@ -5957,25 +5957,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 3(a1)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
@@ -5983,25 +5983,25 @@ define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a1, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a4, 3(a1)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 0(a1)
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a2, a1
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 0(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 1(a0)
 ; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a5, a2
+; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
+; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a4
+; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a3, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a3
 ; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
@@ -6144,8 +6144,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -6164,8 +6164,8 @@ define i1 @memcmp_lt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a0, a1
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
@@ -6322,8 +6322,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sltu a0, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
@@ -6342,8 +6342,8 @@ define i1 @memcmp_gt_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a0, 0(a0)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    lw a1, 0(a1)
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a0, a0
-; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    rev8 a1, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a0, a0, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    srli a1, a1, 32
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sltu a0, a1, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll
index 5b8955ee0e0a0c..1ab3722080f700 100644
--- a/llvm/test/CodeGen/RISCV/memcpy.ll
+++ b/llvm/test/CodeGen/RISCV/memcpy.ll
@@ -26,8 +26,8 @@ define i32 @t0() {
 ; RV32-NEXT:    lui a0, %hi(src)
 ; RV32-NEXT:    lw a1, %lo(src)(a0)
 ; RV32-NEXT:    lui a2, %hi(dst)
-; RV32-NEXT:    sw a1, %lo(dst)(a2)
 ; RV32-NEXT:    addi a0, a0, %lo(src)
+; RV32-NEXT:    sw a1, %lo(dst)(a2)
 ; RV32-NEXT:    lw a1, 4(a0)
 ; RV32-NEXT:    lh a3, 8(a0)
 ; RV32-NEXT:    lbu a0, 10(a0)
@@ -41,13 +41,13 @@ define i32 @t0() {
 ; RV64-LABEL: t0:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    lui a0, %hi(src)
-; RV64-NEXT:    ld a1, %lo(src)(a0)
-; RV64-NEXT:    lui a2, %hi(dst)
+; RV64-NEXT:    lui a1, %hi(dst)
+; RV64-NEXT:    ld a2, %lo(src)(a0)
 ; RV64-NEXT:    addi a0, a0, %lo(src)
 ; RV64-NEXT:    lh a3, 8(a0)
 ; RV64-NEXT:    lbu a0, 10(a0)
-; RV64-NEXT:    sd a1, %lo(dst)(a2)
-; RV64-NEXT:    addi a1, a2, %lo(dst)
+; RV64-NEXT:    sd a2, %lo(dst)(a1)
+; RV64-NEXT:    addi a1, a1, %lo(dst)
 ; RV64-NEXT:    sh a3, 8(a1)
 ; RV64-NEXT:    sb a0, 10(a1)
 ; RV64-NEXT:    li a0, 0
@@ -103,29 +103,29 @@ define void @t1(ptr nocapture %C) nounwind {
 ; RV32-FAST-LABEL: t1:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lui a1, 1141
+; RV32-FAST-NEXT:    lui a2, 300325
+; RV32-FAST-NEXT:    lui a3, 132181
+; RV32-FAST-NEXT:    lui a4, 340483
+; RV32-FAST-NEXT:    lui a5, 267556
+; RV32-FAST-NEXT:    lui a6, 337154
 ; RV32-FAST-NEXT:    addi a1, a1, -439
 ; RV32-FAST-NEXT:    sw a1, 27(a0)
-; RV32-FAST-NEXT:    lui a1, 300325
+; RV32-FAST-NEXT:    lui a1, 320757
+; RV32-FAST-NEXT:    addi a2, a2, 1107
+; RV32-FAST-NEXT:    addi a3, a3, -689
+; RV32-FAST-NEXT:    addi a4, a4, -947
+; RV32-FAST-NEXT:    sw a4, 16(a0)
+; RV32-FAST-NEXT:    sw a3, 20(a0)
+; RV32-FAST-NEXT:    sw a2, 24(a0)
+; RV32-FAST-NEXT:    lui a2, 365861
+; RV32-FAST-NEXT:    addi a3, a5, 1871
+; RV32-FAST-NEXT:    addi a4, a6, 69
 ; RV32-FAST-NEXT:    addi a1, a1, 1107
-; RV32-FAST-NEXT:    lui a2, 132181
-; RV32-FAST-NEXT:    addi a2, a2, -689
-; RV32-FAST-NEXT:    lui a3, 340483
-; RV32-FAST-NEXT:    addi a3, a3, -947
-; RV32-FAST-NEXT:    sw a3, 16(a0)
-; RV32-FAST-NEXT:    sw a2, 20(a0)
-; RV32-FAST-NEXT:    sw a1, 24(a0)
-; RV32-FAST-NEXT:    lui a1, 267556
-; RV32-FAST-NEXT:    addi a1, a1, 1871
-; RV32-FAST-NEXT:    lui a2, 337154
-; RV32-FAST-NEXT:    addi a2, a2, 69
-; RV32-FAST-NEXT:    lui a3, 320757
-; RV32-FAST-NEXT:    addi a3, a3, 1107
-; RV32-FAST-NEXT:    lui a4, 365861
-; RV32-FAST-NEXT:    addi a4, a4, -1980
-; RV32-FAST-NEXT:    sw a4, 0(a0)
-; RV32-FAST-NEXT:    sw a3, 4(a0)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    sw a1, 12(a0)
+; RV32-FAST-NEXT:    addi a2, a2, -1980
+; RV32-FAST-NEXT:    sw a2, 0(a0)
+; RV32-FAST-NEXT:    sw a1, 4(a0)
+; RV32-FAST-NEXT:    sw a4, 8(a0)
+; RV32-FAST-NEXT:    sw a3, 12(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: t1:
@@ -164,16 +164,16 @@ define void @t2(ptr nocapture %C) nounwind {
 ; RV64-FAST-LABEL: t2:
 ; RV64-FAST:       # %bb.0: # %entry
 ; RV64-FAST-NEXT:    lui a1, %hi(.L.str2)
-; RV64-FAST-NEXT:    ld a2, %lo(.L.str2)(a1)
-; RV64-FAST-NEXT:    lui a3, 1156
-; RV64-FAST-NEXT:    addi a3, a3, 332
-; RV64-FAST-NEXT:    sw a3, 32(a0)
+; RV64-FAST-NEXT:    lui a2, 1156
+; RV64-FAST-NEXT:    ld a3, %lo(.L.str2)(a1)
+; RV64-FAST-NEXT:    addi a2, a2, 332
 ; RV64-FAST-NEXT:    addi a1, a1, %lo(.L.str2)
-; RV64-FAST-NEXT:    ld a3, 8(a1)
+; RV64-FAST-NEXT:    sw a2, 32(a0)
+; RV64-FAST-NEXT:    ld a2, 8(a1)
 ; RV64-FAST-NEXT:    ld a4, 16(a1)
 ; RV64-FAST-NEXT:    ld a1, 24(a1)
-; RV64-FAST-NEXT:    sd a2, 0(a0)
-; RV64-FAST-NEXT:    sd a3, 8(a0)
+; RV64-FAST-NEXT:    sd a3, 0(a0)
+; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    sd a4, 16(a0)
 ; RV64-FAST-NEXT:    sd a1, 24(a0)
 ; RV64-FAST-NEXT:    ret
@@ -200,23 +200,23 @@ define void @t3(ptr nocapture %C) nounwind {
 ; RV32-FAST-LABEL: t3:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    lui a1, 1109
-; RV32-FAST-NEXT:    addi a1, a1, -689
 ; RV32-FAST-NEXT:    lui a2, 340483
+; RV32-FAST-NEXT:    lui a3, 267556
+; RV32-FAST-NEXT:    lui a4, 337154
+; RV32-FAST-NEXT:    lui a5, 320757
+; RV32-FAST-NEXT:    addi a1, a1, -689
 ; RV32-FAST-NEXT:    addi a2, a2, -947
 ; RV32-FAST-NEXT:    sw a2, 16(a0)
 ; RV32-FAST-NEXT:    sw a1, 20(a0)
-; RV32-FAST-NEXT:    lui a1, 267556
-; RV32-FAST-NEXT:    addi a1, a1, 1871
-; RV32-FAST-NEXT:    lui a2, 337154
-; RV32-FAST-NEXT:    addi a2, a2, 69
-; RV32-FAST-NEXT:    lui a3, 320757
-; RV32-FAST-NEXT:    addi a3, a3, 1107
-; RV32-FAST-NEXT:    lui a4, 365861
-; RV32-FAST-NEXT:    addi a4, a4, -1980
-; RV32-FAST-NEXT:    sw a4, 0(a0)
-; RV32-FAST-NEXT:    sw a3, 4(a0)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    sw a1, 12(a0)
+; RV32-FAST-NEXT:    lui a1, 365861
+; RV32-FAST-NEXT:    addi a2, a3, 1871
+; RV32-FAST-NEXT:    addi a3, a4, 69
+; RV32-FAST-NEXT:    addi a4, a5, 1107
+; RV32-FAST-NEXT:    addi a1, a1, -1980
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: t3:
@@ -253,19 +253,19 @@ define void @t4(ptr nocapture %C) nounwind {
 ; RV32-FAST-LABEL: t4:
 ; RV32-FAST:       # %bb.0: # %entry
 ; RV32-FAST-NEXT:    li a1, 32
+; RV32-FAST-NEXT:    lui a2, 132388
+; RV32-FAST-NEXT:    lui a3, 337154
+; RV32-FAST-NEXT:    lui a4, 320757
 ; RV32-FAST-NEXT:    sh a1, 16(a0)
-; RV32-FAST-NEXT:    lui a1, 132388
-; RV32-FAST-NEXT:    addi a1, a1, 1871
-; RV32-FAST-NEXT:    lui a2, 337154
-; RV32-FAST-NEXT:    addi a2, a2, 69
-; RV32-FAST-NEXT:    lui a3, 320757
-; RV32-FAST-NEXT:    addi a3, a3, 1107
-; RV32-FAST-NEXT:    lui a4, 365861
-; RV32-FAST-NEXT:    addi a4, a4, -1980
-; RV32-FAST-NEXT:    sw a4, 0(a0)
-; RV32-FAST-NEXT:    sw a3, 4(a0)
-; RV32-FAST-NEXT:    sw a2, 8(a0)
-; RV32-FAST-NEXT:    sw a1, 12(a0)
+; RV32-FAST-NEXT:    lui a1, 365861
+; RV32-FAST-NEXT:    addi a2, a2, 1871
+; RV32-FAST-NEXT:    addi a3, a3, 69
+; RV32-FAST-NEXT:    addi a4, a4, 1107
+; RV32-FAST-NEXT:    addi a1, a1, -1980
+; RV32-FAST-NEXT:    sw a1, 0(a0)
+; RV32-FAST-NEXT:    sw a4, 4(a0)
+; RV32-FAST-NEXT:    sw a3, 8(a0)
+; RV32-FAST-NEXT:    sw a2, 12(a0)
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: t4:
@@ -289,34 +289,34 @@ define void @t5(ptr nocapture %C) nounwind {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a1, 84
 ; RV32-NEXT:    li a2, 83
+; RV32-NEXT:    li a3, 89
+; RV32-NEXT:    li a4, 82
+; RV32-NEXT:    li a5, 72
+; RV32-NEXT:    li a6, 68
 ; RV32-NEXT:    sb a2, 4(a0)
 ; RV32-NEXT:    sb a1, 5(a0)
 ; RV32-NEXT:    sb zero, 6(a0)
-; RV32-NEXT:    li a1, 89
-; RV32-NEXT:    li a2, 82
-; RV32-NEXT:    li a3, 72
-; RV32-NEXT:    li a4, 68
-; RV32-NEXT:    sb a4, 0(a0)
-; RV32-NEXT:    sb a3, 1(a0)
-; RV32-NEXT:    sb a2, 2(a0)
-; RV32-NEXT:    sb a1, 3(a0)
+; RV32-NEXT:    sb a6, 0(a0)
+; RV32-NEXT:    sb a5, 1(a0)
+; RV32-NEXT:    sb a4, 2(a0)
+; RV32-NEXT:    sb a3, 3(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: t5:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    li a1, 84
 ; RV64-NEXT:    li a2, 83
+; RV64-NEXT:    li a3, 89
+; RV64-NEXT:    li a4, 82
+; RV64-NEXT:    li a5, 72
+; RV64-NEXT:    li a6, 68
 ; RV64-NEXT:    sb a2, 4(a0)
 ; RV64-NEXT:    sb a1, 5(a0)
 ; RV64-NEXT:    sb zero, 6(a0)
-; RV64-NEXT:    li a1, 89
-; RV64-NEXT:    li a2, 82
-; RV64-NEXT:    li a3, 72
-; RV64-NEXT:    li a4, 68
-; RV64-NEXT:    sb a4, 0(a0)
-; RV64-NEXT:    sb a3, 1(a0)
-; RV64-NEXT:    sb a2, 2(a0)
-; RV64-NEXT:    sb a1, 3(a0)
+; RV64-NEXT:    sb a6, 0(a0)
+; RV64-NEXT:    sb a5, 1(a0)
+; RV64-NEXT:    sb a4, 2(a0)
+; RV64-NEXT:    sb a3, 3(a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-FAST-LABEL: t5:

diff  --git a/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir b/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir
index 08df378f271897..21398d315ec93a 100644
--- a/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir
+++ b/llvm/test/CodeGen/RISCV/misched-mem-clustering.mir
@@ -1,15 +1,15 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=riscv64 -x mir -mcpu=sifive-p470 -verify-misched -enable-post-misched=false \
 # RUN:     -riscv-postmisched-load-store-clustering=false -debug-only=machine-scheduler \
-# RUN:     -start-before=machine-scheduler -stop-after=postmisched -o - 2>&1 < %s \
+# RUN:     -start-before=machine-scheduler -stop-after=postmisched -misched-regpressure=false -o - 2>&1 < %s \
 # RUN:   | FileCheck -check-prefix=NOPOSTMISCHED %s
 # RUN: llc -mtriple=riscv64 -x mir -mcpu=sifive-p470 -mattr=+use-postra-scheduler -verify-misched -enable-post-misched=true \
 # RUN:     -riscv-postmisched-load-store-clustering=false -debug-only=machine-scheduler \
-# RUN:     -start-before=machine-scheduler -stop-after=postmisched -o - 2>&1 < %s \
+# RUN:     -start-before=machine-scheduler -stop-after=postmisched -misched-regpressure=false -o - 2>&1 < %s \
 # RUN:   | FileCheck -check-prefix=NOCLUSTER %s
 # RUN: llc -mtriple=riscv64 -x mir -mcpu=sifive-p470 -mattr=+use-postra-scheduler -verify-misched -enable-post-misched=true \
 # RUN:     -debug-only=machine-scheduler \
-# RUN:     -start-before=machine-scheduler -stop-after=postmisched -o - 2>&1 < %s \
+# RUN:     -start-before=machine-scheduler -stop-after=postmisched -misched-regpressure=false -o - 2>&1 < %s \
 # RUN:   | FileCheck -check-prefix=MEMCLUSTER %s
 
 # REQUIRES: asserts

diff  --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index e9b84b3cd97ed2..548c7e1c6ea8ce 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -142,8 +142,8 @@ define i64 @mul64(i64 %a, i64 %b) nounwind {
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    mul a3, a0, a3
 ; RV32IM-NEXT:    mulhu a4, a0, a2
-; RV32IM-NEXT:    add a3, a4, a3
 ; RV32IM-NEXT:    mul a1, a1, a2
+; RV32IM-NEXT:    add a3, a4, a3
 ; RV32IM-NEXT:    add a1, a3, a1
 ; RV32IM-NEXT:    mul a0, a0, a2
 ; RV32IM-NEXT:    ret
@@ -163,26 +163,25 @@ define i64 @mul64(i64 %a, i64 %b) nounwind {
 define i64 @mul64_constant(i64 %a) nounwind {
 ; RV32I-LABEL: mul64_constant:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a3, a0, 2
-; RV32I-NEXT:    add a2, a3, a0
-; RV32I-NEXT:    sltu a3, a2, a3
-; RV32I-NEXT:    srli a0, a0, 30
+; RV32I-NEXT:    slli a2, a0, 2
+; RV32I-NEXT:    srli a3, a0, 30
 ; RV32I-NEXT:    slli a4, a1, 2
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    add a1, a0, a3
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: mul64_constant:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    li a2, 5
-; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    slli a3, a1, 2
 ; RV32IM-NEXT:    add a1, a3, a1
+; RV32IM-NEXT:    slli a3, a0, 2
+; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    slli a2, a0, 2
-; RV32IM-NEXT:    add a0, a2, a0
+; RV32IM-NEXT:    add a0, a3, a0
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: mul64_constant:
@@ -251,13 +250,13 @@ define i32 @mulhs_positive_constant(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    slli a2, a0, 2
-; RV32I-NEXT:    add a3, a2, a0
-; RV32I-NEXT:    sltu a2, a3, a2
-; RV32I-NEXT:    srli a0, a0, 30
-; RV32I-NEXT:    slli a3, a1, 2
-; RV32I-NEXT:    or a0, a3, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    srli a3, a0, 30
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    sltu a0, a0, a2
+; RV32I-NEXT:    slli a2, a1, 2
+; RV32I-NEXT:    or a2, a2, a3
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: mulhs_positive_constant:
@@ -293,15 +292,15 @@ define i32 @mulhs_negative_constant(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a1, a0, 31
 ; RV32I-NEXT:    slli a2, a0, 2
-; RV32I-NEXT:    add a3, a2, a0
-; RV32I-NEXT:    sltu a2, a3, a2
-; RV32I-NEXT:    srli a0, a0, 30
+; RV32I-NEXT:    srli a3, a0, 30
+; RV32I-NEXT:    add a0, a2, a0
 ; RV32I-NEXT:    slli a4, a1, 2
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    snez a1, a3
-; RV32I-NEXT:    add a1, a2, a1
-; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    snez a0, a0
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
 ;
@@ -704,26 +703,25 @@ define i32 @muli32_p63(i32 %a) nounwind {
 define i64 @muli64_p65(i64 %a) nounwind {
 ; RV32I-LABEL: muli64_p65:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    slli a3, a0, 6
-; RV32I-NEXT:    add a2, a3, a0
-; RV32I-NEXT:    sltu a3, a2, a3
-; RV32I-NEXT:    srli a0, a0, 26
+; RV32I-NEXT:    slli a2, a0, 6
+; RV32I-NEXT:    srli a3, a0, 26
 ; RV32I-NEXT:    slli a4, a1, 6
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    add a1, a0, a3
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_p65:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    li a2, 65
-; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    slli a3, a1, 6
 ; RV32IM-NEXT:    add a1, a3, a1
+; RV32IM-NEXT:    slli a3, a0, 6
+; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    slli a2, a0, 6
-; RV32IM-NEXT:    add a0, a2, a0
+; RV32IM-NEXT:    add a0, a3, a0
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: muli64_p65:
@@ -745,24 +743,24 @@ define i64 @muli64_p63(i64 %a) nounwind {
 ; RV32I-LABEL: muli64_p63:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a0, 6
-; RV32I-NEXT:    sltu a3, a2, a0
-; RV32I-NEXT:    srli a4, a0, 26
-; RV32I-NEXT:    slli a5, a1, 6
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    sub a1, a4, a1
-; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    srli a3, a0, 26
+; RV32I-NEXT:    slli a4, a1, 6
+; RV32I-NEXT:    sltu a5, a2, a0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sub a1, a3, a1
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a0, a2, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_p63:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    li a2, 63
-; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    slli a3, a1, 6
 ; RV32IM-NEXT:    sub a1, a3, a1
+; RV32IM-NEXT:    slli a3, a0, 6
+; RV32IM-NEXT:    mulhu a2, a0, a2
 ; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    slli a2, a0, 6
-; RV32IM-NEXT:    sub a0, a2, a0
+; RV32IM-NEXT:    sub a0, a3, a0
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: muli64_p63:
@@ -846,12 +844,12 @@ define i64 @muli64_m63(i64 %a) nounwind {
 ; RV32I-LABEL: muli64_m63:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a0, 6
-; RV32I-NEXT:    sltu a3, a0, a2
-; RV32I-NEXT:    srli a4, a0, 26
-; RV32I-NEXT:    slli a5, a1, 6
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    sub a1, a1, a4
+; RV32I-NEXT:    srli a3, a0, 26
+; RV32I-NEXT:    slli a4, a1, 6
+; RV32I-NEXT:    sltu a5, a0, a2
+; RV32I-NEXT:    or a3, a4, a3
 ; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a1, a1, a5
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
 ;
@@ -886,17 +884,17 @@ define i64 @muli64_m65(i64 %a) nounwind {
 ; RV32I-LABEL: muli64_m65:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a0, 6
-; RV32I-NEXT:    add a3, a2, a0
-; RV32I-NEXT:    sltu a2, a3, a2
-; RV32I-NEXT:    srli a0, a0, 26
+; RV32I-NEXT:    srli a3, a0, 26
 ; RV32I-NEXT:    slli a4, a1, 6
-; RV32I-NEXT:    or a0, a4, a0
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    add a0, a0, a2
-; RV32I-NEXT:    snez a1, a3
-; RV32I-NEXT:    neg a1, a1
-; RV32I-NEXT:    sub a1, a1, a0
-; RV32I-NEXT:    neg a0, a3
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    snez a3, a0
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    neg a2, a3
+; RV32I-NEXT:    sub a1, a2, a1
+; RV32I-NEXT:    neg a0, a0
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_m65:
@@ -1129,11 +1127,11 @@ define i64 @muli64_p4352(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a3, a0, 20
 ; RV32I-NEXT:    slli a1, a1, 12
 ; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    slli a3, a0, 8
+; RV32I-NEXT:    slli a4, a0, 12
+; RV32I-NEXT:    add a0, a4, a3
 ; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 8
-; RV32I-NEXT:    slli a3, a0, 12
-; RV32I-NEXT:    add a0, a3, a2
-; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    sltu a2, a0, a4
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
@@ -1173,12 +1171,12 @@ define i64 @muli64_p3840(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a3, a0, 20
 ; RV32I-NEXT:    slli a1, a1, 12
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a0, 8
 ; RV32I-NEXT:    slli a0, a0, 12
-; RV32I-NEXT:    sltu a3, a0, a2
-; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_p3840:
@@ -1261,12 +1259,12 @@ define i64 @muli64_m3840(i64 %a) nounwind {
 ; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 8
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 12
+; RV32I-NEXT:    slli a3, a0, 12
 ; RV32I-NEXT:    slli a0, a0, 8
-; RV32I-NEXT:    sltu a3, a0, a2
-; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    sub a0, a0, a2
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sltu a2, a0, a3
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sub a0, a0, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli64_m3840:
@@ -1300,105 +1298,103 @@ define i64 @muli64_m3840(i64 %a) nounwind {
 define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV32I-LABEL: muli128_m3840:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a3, 8(a1)
-; RV32I-NEXT:    lw a6, 0(a1)
-; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    srli a1, a4, 20
-; RV32I-NEXT:    slli a2, a3, 12
-; RV32I-NEXT:    or a1, a2, a1
-; RV32I-NEXT:    srli a2, a4, 24
-; RV32I-NEXT:    slli a7, a3, 8
-; RV32I-NEXT:    or a2, a7, a2
-; RV32I-NEXT:    sltu t0, a2, a1
-; RV32I-NEXT:    srli a7, a3, 20
-; RV32I-NEXT:    slli t1, a5, 12
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    slli a5, a5, 8
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    sub t1, a3, a7
-; RV32I-NEXT:    srli a3, a6, 20
-; RV32I-NEXT:    slli a5, a4, 12
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    srli a5, a6, 24
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a2, 8(a1)
+; RV32I-NEXT:    lw a5, 0(a1)
+; RV32I-NEXT:    lw a4, 12(a1)
+; RV32I-NEXT:    srli a1, a3, 20
+; RV32I-NEXT:    slli a6, a2, 12
+; RV32I-NEXT:    srli a7, a3, 24
+; RV32I-NEXT:    slli t0, a2, 8
+; RV32I-NEXT:    srli t1, a2, 20
+; RV32I-NEXT:    or a1, a6, a1
+; RV32I-NEXT:    slli a6, a4, 12
+; RV32I-NEXT:    srli t2, a2, 24
 ; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    slli a4, a6, 12
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    sltu a7, a6, a4
-; RV32I-NEXT:    sub t0, t1, t0
+; RV32I-NEXT:    or a2, t0, a7
+; RV32I-NEXT:    srli a7, a5, 20
+; RV32I-NEXT:    or a6, a6, t1
+; RV32I-NEXT:    slli t0, a3, 12
+; RV32I-NEXT:    or t1, a4, t2
+; RV32I-NEXT:    srli t2, a5, 24
+; RV32I-NEXT:    slli t3, a3, 8
+; RV32I-NEXT:    or a3, t0, a7
+; RV32I-NEXT:    slli a4, a5, 12
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or t0, t3, t2
+; RV32I-NEXT:    sltu t2, a2, a1
+; RV32I-NEXT:    sub a6, t1, a6
+; RV32I-NEXT:    sltu a7, a5, a4
+; RV32I-NEXT:    sub a6, a6, t2
 ; RV32I-NEXT:    mv t1, a7
-; RV32I-NEXT:    beq a5, a3, .LBB36_2
+; RV32I-NEXT:    beq t0, a3, .LBB36_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t1, a5, a3
+; RV32I-NEXT:    sltu t1, t0, a3
 ; RV32I-NEXT:  .LBB36_2:
 ; RV32I-NEXT:    sub a2, a2, a1
-; RV32I-NEXT:    sltu a1, a2, t1
-; RV32I-NEXT:    sub a1, t0, a1
+; RV32I-NEXT:    sub a1, t0, a3
+; RV32I-NEXT:    sub a5, a5, a4
+; RV32I-NEXT:    sltu a3, a2, t1
 ; RV32I-NEXT:    sub a2, a2, t1
-; RV32I-NEXT:    sub a5, a5, a3
-; RV32I-NEXT:    sub a3, a5, a7
-; RV32I-NEXT:    sub a4, a6, a4
-; RV32I-NEXT:    sw a4, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
+; RV32I-NEXT:    sub a1, a1, a7
+; RV32I-NEXT:    sub a3, a6, a3
+; RV32I-NEXT:    sw a5, 0(a0)
+; RV32I-NEXT:    sw a1, 4(a0)
 ; RV32I-NEXT:    sw a2, 8(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
+; RV32I-NEXT:    sw a3, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli128_m3840:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 0(a1)
-; RV32IM-NEXT:    lw a3, 4(a1)
+; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw a2, 4(a1)
 ; RV32IM-NEXT:    lw a4, 8(a1)
 ; RV32IM-NEXT:    lw a1, 12(a1)
 ; RV32IM-NEXT:    li a5, -15
+; RV32IM-NEXT:    li a6, -1
 ; RV32IM-NEXT:    slli a5, a5, 8
-; RV32IM-NEXT:    mulhu a6, a2, a5
-; RV32IM-NEXT:    mul a7, a3, a5
-; RV32IM-NEXT:    add a6, a7, a6
-; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a3, a5
-; RV32IM-NEXT:    add a7, t0, a7
-; RV32IM-NEXT:    sub a6, a6, a2
-; RV32IM-NEXT:    neg t0, a2
-; RV32IM-NEXT:    sltu t1, a6, t0
-; RV32IM-NEXT:    li t2, -1
-; RV32IM-NEXT:    mulhu t3, a2, t2
-; RV32IM-NEXT:    add t1, t3, t1
-; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a3
-; RV32IM-NEXT:    mul t5, a4, a5
-; RV32IM-NEXT:    sub t5, t5, a2
-; RV32IM-NEXT:    add t6, t4, t5
-; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a3
-; RV32IM-NEXT:    sltu t4, t4, s1
-; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a3, t2
-; RV32IM-NEXT:    add a7, t1, a7
-; RV32IM-NEXT:    add a7, a7, t4
-; RV32IM-NEXT:    sltu t0, t5, t0
+; RV32IM-NEXT:    mulhu a7, a3, a5
+; RV32IM-NEXT:    mul t0, a2, a5
+; RV32IM-NEXT:    mulhu t1, a2, a5
+; RV32IM-NEXT:    neg t2, a3
+; RV32IM-NEXT:    mulhu t3, a3, a6
+; RV32IM-NEXT:    mul t4, a4, a5
+; RV32IM-NEXT:    neg t5, a2
+; RV32IM-NEXT:    mulhu a6, a2, a6
 ; RV32IM-NEXT:    mul a1, a1, a5
-; RV32IM-NEXT:    mulhu t1, a4, a5
-; RV32IM-NEXT:    sub a4, t1, a4
+; RV32IM-NEXT:    mulhu t6, a4, a5
+; RV32IM-NEXT:    add s0, a3, a2
+; RV32IM-NEXT:    mul a5, a3, a5
+; RV32IM-NEXT:    add a7, t0, a7
+; RV32IM-NEXT:    sub t4, t4, a3
+; RV32IM-NEXT:    sub a4, t6, a4
+; RV32IM-NEXT:    sub t6, t3, s0
+; RV32IM-NEXT:    sltu t0, a7, t0
+; RV32IM-NEXT:    sub a3, a7, a3
+; RV32IM-NEXT:    sltu a7, t4, t2
 ; RV32IM-NEXT:    add a1, a4, a1
-; RV32IM-NEXT:    add a3, a2, a3
-; RV32IM-NEXT:    sub a3, t3, a3
-; RV32IM-NEXT:    add a1, a3, a1
-; RV32IM-NEXT:    add a1, a1, t0
-; RV32IM-NEXT:    add a1, a7, a1
-; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    mul a2, a2, a5
-; RV32IM-NEXT:    sw a2, 0(a0)
-; RV32IM-NEXT:    sw a6, 4(a0)
-; RV32IM-NEXT:    sw t6, 8(a0)
+; RV32IM-NEXT:    add t0, t1, t0
+; RV32IM-NEXT:    sltu a4, a3, t2
+; RV32IM-NEXT:    add a1, t6, a1
+; RV32IM-NEXT:    add a4, t3, a4
+; RV32IM-NEXT:    add a1, a1, a7
+; RV32IM-NEXT:    add a4, t0, a4
+; RV32IM-NEXT:    sub a2, a4, a2
+; RV32IM-NEXT:    sltu a4, a4, t0
+; RV32IM-NEXT:    add t4, a2, t4
+; RV32IM-NEXT:    sltu a7, a2, t5
+; RV32IM-NEXT:    add a4, a6, a4
+; RV32IM-NEXT:    sltu a2, t4, a2
+; RV32IM-NEXT:    add a4, a4, a7
+; RV32IM-NEXT:    add a1, a4, a1
+; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    sw a5, 0(a0)
+; RV32IM-NEXT:    sw a3, 4(a0)
+; RV32IM-NEXT:    sw t4, 8(a0)
 ; RV32IM-NEXT:    sw a1, 12(a0)
 ; RV32IM-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32IM-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT:    addi sp, sp, 16
 ; RV32IM-NEXT:    ret
 ;
@@ -1410,12 +1406,12 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 ; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 8
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    slli a2, a0, 12
+; RV64I-NEXT:    slli a3, a0, 12
 ; RV64I-NEXT:    slli a0, a0, 8
-; RV64I-NEXT:    sltu a3, a0, a2
-; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    sub a0, a0, a2
+; RV64I-NEXT:    sub a1, a1, a2
+; RV64I-NEXT:    sltu a2, a0, a3
+; RV64I-NEXT:    sub a1, a1, a2
+; RV64I-NEXT:    sub a0, a0, a3
 ; RV64I-NEXT:    ret
 ;
 ; RV64IM-LABEL: muli128_m3840:
@@ -1435,40 +1431,40 @@ define i128 @muli128_m3840(i128 %a) nounwind {
 define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32I-LABEL: muli128_m63:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a2, 0(a1)
-; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a6, 8(a1)
-; RV32I-NEXT:    lw a5, 12(a1)
-; RV32I-NEXT:    slli a1, a2, 6
-; RV32I-NEXT:    sltu a4, a2, a1
-; RV32I-NEXT:    srli a7, a2, 26
-; RV32I-NEXT:    slli t0, a3, 6
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    mv t0, a4
-; RV32I-NEXT:    beq a3, a7, .LBB37_2
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a2, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    slli a6, a3, 6
+; RV32I-NEXT:    srli a5, a3, 26
+; RV32I-NEXT:    slli t0, a4, 6
+; RV32I-NEXT:    sltu a7, a3, a6
+; RV32I-NEXT:    or t0, t0, a5
+; RV32I-NEXT:    mv a5, a7
+; RV32I-NEXT:    beq a4, t0, .LBB37_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    sltu t0, a3, a7
+; RV32I-NEXT:    sltu a5, a4, t0
 ; RV32I-NEXT:  .LBB37_2:
-; RV32I-NEXT:    srli t1, a3, 26
-; RV32I-NEXT:    slli t2, a6, 6
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    sub t2, a6, t1
-; RV32I-NEXT:    sltu t3, t2, t0
-; RV32I-NEXT:    sltu t1, a6, t1
-; RV32I-NEXT:    srli a6, a6, 26
-; RV32I-NEXT:    slli t4, a5, 6
-; RV32I-NEXT:    or a6, t4, a6
-; RV32I-NEXT:    sub a5, a5, a6
-; RV32I-NEXT:    sub a5, a5, t1
-; RV32I-NEXT:    sub a5, a5, t3
-; RV32I-NEXT:    sub a6, t2, t0
-; RV32I-NEXT:    sub a3, a3, a7
-; RV32I-NEXT:    sub a3, a3, a4
-; RV32I-NEXT:    sub a2, a2, a1
-; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a6, 8(a0)
-; RV32I-NEXT:    sw a5, 12(a0)
+; RV32I-NEXT:    srli t1, a4, 26
+; RV32I-NEXT:    slli t2, a2, 6
+; RV32I-NEXT:    srli t3, a2, 26
+; RV32I-NEXT:    slli t4, a1, 6
+; RV32I-NEXT:    sub a4, a4, t0
+; RV32I-NEXT:    sub a3, a3, a6
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or t0, t4, t3
+; RV32I-NEXT:    sub a4, a4, a7
+; RV32I-NEXT:    sub a7, a2, a6
+; RV32I-NEXT:    sltu a2, a2, a6
+; RV32I-NEXT:    sub a1, a1, t0
+; RV32I-NEXT:    sltu a6, a7, a5
+; RV32I-NEXT:    sub a1, a1, a2
+; RV32I-NEXT:    sub a2, a7, a5
+; RV32I-NEXT:    sub a1, a1, a6
+; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    sw a4, 4(a0)
+; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV32IM-LABEL: muli128_m63:
@@ -1476,54 +1472,54 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV32IM-NEXT:    addi sp, sp, -16
 ; RV32IM-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
 ; RV32IM-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
-; RV32IM-NEXT:    lw a2, 0(a1)
-; RV32IM-NEXT:    lw a3, 4(a1)
+; RV32IM-NEXT:    lw a3, 0(a1)
+; RV32IM-NEXT:    lw a2, 4(a1)
 ; RV32IM-NEXT:    lw a4, 8(a1)
 ; RV32IM-NEXT:    lw a1, 12(a1)
 ; RV32IM-NEXT:    li a5, -63
-; RV32IM-NEXT:    mulhu a6, a2, a5
-; RV32IM-NEXT:    slli a7, a3, 6
-; RV32IM-NEXT:    sub a7, a3, a7
-; RV32IM-NEXT:    add a6, a7, a6
-; RV32IM-NEXT:    sltu a7, a6, a7
-; RV32IM-NEXT:    mulhu t0, a3, a5
-; RV32IM-NEXT:    add a7, t0, a7
-; RV32IM-NEXT:    sub a6, a6, a2
-; RV32IM-NEXT:    neg t0, a2
-; RV32IM-NEXT:    sltu t1, a6, t0
-; RV32IM-NEXT:    li t2, -1
-; RV32IM-NEXT:    mulhu t3, a2, t2
-; RV32IM-NEXT:    add t1, t3, t1
-; RV32IM-NEXT:    add t1, a7, t1
-; RV32IM-NEXT:    sub t4, t1, a3
-; RV32IM-NEXT:    slli t5, a4, 6
-; RV32IM-NEXT:    sub t6, a4, a2
-; RV32IM-NEXT:    sub t5, t6, t5
-; RV32IM-NEXT:    add t6, t4, t5
-; RV32IM-NEXT:    sltu s0, t6, t4
-; RV32IM-NEXT:    neg s1, a3
-; RV32IM-NEXT:    sltu t4, t4, s1
-; RV32IM-NEXT:    sltu a7, t1, a7
-; RV32IM-NEXT:    mulhu t1, a3, t2
-; RV32IM-NEXT:    add a7, t1, a7
-; RV32IM-NEXT:    add a7, a7, t4
-; RV32IM-NEXT:    sltu t0, t5, t0
-; RV32IM-NEXT:    slli t1, a1, 6
-; RV32IM-NEXT:    sub a1, a1, t1
+; RV32IM-NEXT:    li a6, -1
+; RV32IM-NEXT:    mulhu a7, a3, a5
+; RV32IM-NEXT:    slli t0, a2, 6
+; RV32IM-NEXT:    mulhu t1, a2, a5
+; RV32IM-NEXT:    neg t2, a3
+; RV32IM-NEXT:    mulhu t3, a3, a6
+; RV32IM-NEXT:    slli t4, a4, 6
+; RV32IM-NEXT:    sub t5, a4, a3
+; RV32IM-NEXT:    neg t6, a2
+; RV32IM-NEXT:    mulhu a6, a2, a6
+; RV32IM-NEXT:    slli s0, a1, 6
 ; RV32IM-NEXT:    mulhu a5, a4, a5
+; RV32IM-NEXT:    add s1, a3, a2
+; RV32IM-NEXT:    sub t4, t5, t4
+; RV32IM-NEXT:    slli t5, a3, 6
+; RV32IM-NEXT:    sub t0, a2, t0
+; RV32IM-NEXT:    sub a1, a1, s0
 ; RV32IM-NEXT:    sub a5, a5, a4
+; RV32IM-NEXT:    sub a4, t3, s1
+; RV32IM-NEXT:    sub t5, a3, t5
+; RV32IM-NEXT:    add a7, t0, a7
+; RV32IM-NEXT:    sltu s0, t4, t2
 ; RV32IM-NEXT:    add a1, a5, a1
-; RV32IM-NEXT:    add a3, a2, a3
-; RV32IM-NEXT:    sub a3, t3, a3
-; RV32IM-NEXT:    add a1, a3, a1
-; RV32IM-NEXT:    add a1, a1, t0
-; RV32IM-NEXT:    add a1, a7, a1
+; RV32IM-NEXT:    sltu a5, a7, t0
+; RV32IM-NEXT:    sub a3, a7, a3
+; RV32IM-NEXT:    add a1, a4, a1
+; RV32IM-NEXT:    add a5, t1, a5
+; RV32IM-NEXT:    sltu a4, a3, t2
 ; RV32IM-NEXT:    add a1, a1, s0
-; RV32IM-NEXT:    slli a3, a2, 6
-; RV32IM-NEXT:    sub a2, a2, a3
-; RV32IM-NEXT:    sw a2, 0(a0)
-; RV32IM-NEXT:    sw a6, 4(a0)
-; RV32IM-NEXT:    sw t6, 8(a0)
+; RV32IM-NEXT:    add a4, t3, a4
+; RV32IM-NEXT:    add a4, a5, a4
+; RV32IM-NEXT:    sub a2, a4, a2
+; RV32IM-NEXT:    sltu a4, a4, a5
+; RV32IM-NEXT:    add t4, a2, t4
+; RV32IM-NEXT:    sltu a5, a2, t6
+; RV32IM-NEXT:    add a4, a6, a4
+; RV32IM-NEXT:    sltu a2, t4, a2
+; RV32IM-NEXT:    add a4, a4, a5
+; RV32IM-NEXT:    add a1, a4, a1
+; RV32IM-NEXT:    add a1, a1, a2
+; RV32IM-NEXT:    sw t5, 0(a0)
+; RV32IM-NEXT:    sw a3, 4(a0)
+; RV32IM-NEXT:    sw t4, 8(a0)
 ; RV32IM-NEXT:    sw a1, 12(a0)
 ; RV32IM-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32IM-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -1533,12 +1529,12 @@ define i128 @muli128_m63(i128 %a) nounwind {
 ; RV64I-LABEL: muli128_m63:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a2, a0, 6
-; RV64I-NEXT:    sltu a3, a0, a2
-; RV64I-NEXT:    srli a4, a0, 58
-; RV64I-NEXT:    slli a5, a1, 6
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    sub a1, a1, a4
+; RV64I-NEXT:    srli a3, a0, 58
+; RV64I-NEXT:    slli a4, a1, 6
+; RV64I-NEXT:    sltu a5, a0, a2
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    sub a1, a1, a3
+; RV64I-NEXT:    sub a1, a1, a5
 ; RV64I-NEXT:    sub a0, a0, a2
 ; RV64I-NEXT:    ret
 ;
@@ -1619,17 +1615,17 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-NEXT:    mv a3, s0
 ; RV32I-NEXT:    call __muldi3
 ; RV32I-NEXT:    add s2, a0, s2
-; RV32I-NEXT:    add a2, s9, s2
-; RV32I-NEXT:    sltu a3, a2, s9
-; RV32I-NEXT:    sltu a4, s9, s5
-; RV32I-NEXT:    sltu a5, s8, s7
-; RV32I-NEXT:    add a5, s6, a5
-; RV32I-NEXT:    add a4, a5, a4
+; RV32I-NEXT:    sltu a3, s9, s5
+; RV32I-NEXT:    sltu a4, s8, s7
 ; RV32I-NEXT:    add a1, a1, s3
+; RV32I-NEXT:    add a2, s9, s2
+; RV32I-NEXT:    add a4, s6, a4
 ; RV32I-NEXT:    sltu a0, s2, a0
+; RV32I-NEXT:    sltu a5, a2, s9
+; RV32I-NEXT:    add a3, a4, a3
 ; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    add a1, a0, a3
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    add a1, a0, a5
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
@@ -1650,33 +1646,32 @@ define i64 @mulhsu_i64(i64 %a, i64 %b) nounwind {
 ; RV32IM-NEXT:    srai a4, a3, 31
 ; RV32IM-NEXT:    mulhu a5, a0, a2
 ; RV32IM-NEXT:    mul a6, a1, a2
-; RV32IM-NEXT:    add a5, a6, a5
-; RV32IM-NEXT:    sltu a6, a5, a6
 ; RV32IM-NEXT:    mulhu a2, a1, a2
-; RV32IM-NEXT:    add a6, a2, a6
-; RV32IM-NEXT:    mul a2, a0, a3
-; RV32IM-NEXT:    add a5, a2, a5
-; RV32IM-NEXT:    sltu a2, a5, a2
-; RV32IM-NEXT:    mulhu a5, a0, a3
-; RV32IM-NEXT:    add a2, a5, a2
-; RV32IM-NEXT:    add a5, a6, a2
-; RV32IM-NEXT:    mul a7, a1, a3
-; RV32IM-NEXT:    add t0, a7, a5
-; RV32IM-NEXT:    mul t1, a4, a0
-; RV32IM-NEXT:    add a2, t0, t1
-; RV32IM-NEXT:    sltu t2, a2, t0
-; RV32IM-NEXT:    sltu a7, t0, a7
-; RV32IM-NEXT:    sltu a5, a5, a6
+; RV32IM-NEXT:    mul a7, a0, a3
+; RV32IM-NEXT:    mulhu t0, a0, a3
+; RV32IM-NEXT:    mul t1, a1, a3
 ; RV32IM-NEXT:    mulhu a3, a1, a3
-; RV32IM-NEXT:    add a3, a3, a5
-; RV32IM-NEXT:    add a3, a3, a7
+; RV32IM-NEXT:    add a5, a6, a5
+; RV32IM-NEXT:    mul t2, a4, a0
 ; RV32IM-NEXT:    mul a1, a4, a1
 ; RV32IM-NEXT:    mulhu a0, a4, a0
-; RV32IM-NEXT:    add a0, a0, a1
-; RV32IM-NEXT:    add a0, a0, t1
-; RV32IM-NEXT:    add a0, a3, a0
-; RV32IM-NEXT:    add a1, a0, t2
-; RV32IM-NEXT:    mv a0, a2
+; RV32IM-NEXT:    sltu a4, a5, a6
+; RV32IM-NEXT:    add a5, a7, a5
+; RV32IM-NEXT:    add a1, a0, a1
+; RV32IM-NEXT:    add a2, a2, a4
+; RV32IM-NEXT:    sltu a0, a5, a7
+; RV32IM-NEXT:    add a0, t0, a0
+; RV32IM-NEXT:    add a0, a2, a0
+; RV32IM-NEXT:    add a4, t1, a0
+; RV32IM-NEXT:    sltu a2, a0, a2
+; RV32IM-NEXT:    add a0, a4, t2
+; RV32IM-NEXT:    sltu a5, a4, t1
+; RV32IM-NEXT:    add a2, a3, a2
+; RV32IM-NEXT:    sltu a3, a0, a4
+; RV32IM-NEXT:    add a2, a2, a5
+; RV32IM-NEXT:    add a1, a1, t2
+; RV32IM-NEXT:    add a1, a2, a1
+; RV32IM-NEXT:    add a1, a1, a3
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: mulhsu_i64:

diff  --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index 676b4134461163..7d6a6d7ed4ce64 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -81,8 +81,8 @@ define i64 @neg_abs64(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
+; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
@@ -92,8 +92,8 @@ define i64 @neg_abs64(i64 %x) {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    xor a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0
@@ -121,8 +121,8 @@ define i64 @select_neg_abs64(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srai a2, a1, 31
 ; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    xor a1, a1, a2
+; RV32I-NEXT:    sltu a3, a2, a0
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    sub a1, a1, a3
 ; RV32I-NEXT:    sub a0, a2, a0
@@ -132,8 +132,8 @@ define i64 @select_neg_abs64(i64 %x) {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    srai a2, a1, 31
 ; RV32ZBB-NEXT:    xor a0, a0, a2
-; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    xor a1, a1, a2
+; RV32ZBB-NEXT:    sltu a3, a2, a0
 ; RV32ZBB-NEXT:    sub a1, a2, a1
 ; RV32ZBB-NEXT:    sub a1, a1, a3
 ; RV32ZBB-NEXT:    sub a0, a2, a0

diff  --git a/llvm/test/CodeGen/RISCV/or-is-add.ll b/llvm/test/CodeGen/RISCV/or-is-add.ll
index 36a201d2776752..73561675b17ecf 100644
--- a/llvm/test/CodeGen/RISCV/or-is-add.ll
+++ b/llvm/test/CodeGen/RISCV/or-is-add.ll
@@ -58,8 +58,8 @@ define i64 @test4(i64 %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    srli a2, a0, 28
 ; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    or a1, a1, a2
 ; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    or a1, a1, a2
 ; RV32-NEXT:    addi a0, a0, 13
 ; RV32-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 95b106f4d35ba0..5a01d43fea56ba 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -693,13 +693,12 @@ define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) {
 ; RV32-LABEL: uaddo_i64_decrement_alt:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    or a3, a0, a1
-; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    seqz a4, a0
+; RV32-NEXT:    addi a5, a0, -1
+; RV32-NEXT:    snez a0, a3
 ; RV32-NEXT:    sub a1, a1, a4
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    sw a5, 0(a2)
 ; RV32-NEXT:    sw a1, 4(a2)
-; RV32-NEXT:    mv a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: uaddo_i64_decrement_alt:
@@ -721,13 +720,12 @@ define i1 @uaddo_i64_decrement_alt_dom(i64 %x, ptr %p) {
 ; RV32-LABEL: uaddo_i64_decrement_alt_dom:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    or a3, a0, a1
-; RV32-NEXT:    snez a3, a3
 ; RV32-NEXT:    seqz a4, a0
+; RV32-NEXT:    addi a5, a0, -1
+; RV32-NEXT:    snez a0, a3
 ; RV32-NEXT:    sub a1, a1, a4
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    sw a5, 0(a2)
 ; RV32-NEXT:    sw a1, 4(a2)
-; RV32-NEXT:    mv a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: uaddo_i64_decrement_alt_dom:
@@ -800,10 +798,10 @@ define i1 @usubo_ult_i64_math_overflow_used(i64 %x, i64 %y, ptr %p) {
 ; RV32-NEXT:    mv a5, a0
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub a6, a1, a3
-; RV32-NEXT:    sub a6, a6, a0
 ; RV32-NEXT:    sub a5, a5, a2
+; RV32-NEXT:    sub a2, a6, a0
 ; RV32-NEXT:    sw a5, 0(a4)
-; RV32-NEXT:    sw a6, 4(a4)
+; RV32-NEXT:    sw a2, 4(a4)
 ; RV32-NEXT:    beq a1, a3, .LBB23_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    sltu a0, a1, a3
@@ -882,8 +880,8 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) {
 ; RV32-LABEL: usubo_ult_constant_op0_i16:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a2, a0, 16
-; RV32-NEXT:    srli a2, a2, 16
 ; RV32-NEXT:    li a3, 43
+; RV32-NEXT:    srli a2, a2, 16
 ; RV32-NEXT:    sub a3, a3, a0
 ; RV32-NEXT:    sltiu a0, a2, 44
 ; RV32-NEXT:    xori a0, a0, 1
@@ -893,8 +891,8 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) {
 ; RV64-LABEL: usubo_ult_constant_op0_i16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a2, a0, 48
-; RV64-NEXT:    srli a2, a2, 48
 ; RV64-NEXT:    li a3, 43
+; RV64-NEXT:    srli a2, a2, 48
 ; RV64-NEXT:    subw a3, a3, a0
 ; RV64-NEXT:    sltiu a0, a2, 44
 ; RV64-NEXT:    xori a0, a0, 1
@@ -1015,10 +1013,10 @@ define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV32-NEXT:    mv a7, a0
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    sub t0, a1, a3
-; RV32-NEXT:    sub t0, t0, a0
 ; RV32-NEXT:    sub a2, a7, a2
+; RV32-NEXT:    sub a7, t0, a0
 ; RV32-NEXT:    sw a2, 0(a4)
-; RV32-NEXT:    sw t0, 4(a4)
+; RV32-NEXT:    sw a7, 4(a4)
 ; RV32-NEXT:    beqz a6, .LBB31_5
 ; RV32-NEXT:  # %bb.2: # %end
 ; RV32-NEXT:    beq a1, a3, .LBB31_4
@@ -1081,18 +1079,18 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV32-NEXT:    .cfi_offset s4, -24
 ; RV32-NEXT:    .cfi_offset s5, -28
 ; RV32-NEXT:    .cfi_offset s6, -32
-; RV32-NEXT:    mv s2, a5
-; RV32-NEXT:    andi a5, a5, 1
-; RV32-NEXT:    beqz a5, .LBB32_8
+; RV32-NEXT:    mv s5, a5
+; RV32-NEXT:    mv s3, a1
+; RV32-NEXT:    andi a1, a5, 1
+; RV32-NEXT:    beqz a1, .LBB32_8
 ; RV32-NEXT:  # %bb.1: # %t
 ; RV32-NEXT:    mv s0, a4
-; RV32-NEXT:    mv s3, a3
+; RV32-NEXT:    mv s2, a3
 ; RV32-NEXT:    mv s1, a2
-; RV32-NEXT:    mv s5, a1
 ; RV32-NEXT:    mv s4, a0
-; RV32-NEXT:    beq a1, a3, .LBB32_3
+; RV32-NEXT:    beq s3, a3, .LBB32_3
 ; RV32-NEXT:  # %bb.2: # %t
-; RV32-NEXT:    sltu s6, s5, s3
+; RV32-NEXT:    sltu s6, s3, s2
 ; RV32-NEXT:    j .LBB32_4
 ; RV32-NEXT:  .LBB32_3:
 ; RV32-NEXT:    sltu s6, s4, s1
@@ -1103,18 +1101,18 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV32-NEXT:  # %bb.5: # %end
 ; RV32-NEXT:    sltu a1, s4, s1
 ; RV32-NEXT:    mv a0, a1
-; RV32-NEXT:    beq s5, s3, .LBB32_7
+; RV32-NEXT:    beq s3, s2, .LBB32_7
 ; RV32-NEXT:  # %bb.6: # %end
-; RV32-NEXT:    sltu a0, s5, s3
+; RV32-NEXT:    sltu a0, s3, s2
 ; RV32-NEXT:  .LBB32_7: # %end
-; RV32-NEXT:    sub a2, s5, s3
+; RV32-NEXT:    sub a2, s3, s2
+; RV32-NEXT:    sub a3, s4, s1
 ; RV32-NEXT:    sub a2, a2, a1
-; RV32-NEXT:    sub a1, s4, s1
-; RV32-NEXT:    sw a1, 0(s0)
+; RV32-NEXT:    sw a3, 0(s0)
 ; RV32-NEXT:    sw a2, 4(s0)
 ; RV32-NEXT:    j .LBB32_9
 ; RV32-NEXT:  .LBB32_8: # %f
-; RV32-NEXT:    mv a0, s2
+; RV32-NEXT:    mv a0, s5
 ; RV32-NEXT:  .LBB32_9: # %f
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
@@ -1153,13 +1151,13 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) {
 ; RV64-NEXT:    .cfi_offset s3, -40
 ; RV64-NEXT:    .cfi_offset s4, -48
 ; RV64-NEXT:    mv s0, a3
-; RV64-NEXT:    andi a3, a3, 1
-; RV64-NEXT:    beqz a3, .LBB32_3
+; RV64-NEXT:    mv s2, a1
+; RV64-NEXT:    andi a1, a3, 1
+; RV64-NEXT:    beqz a1, .LBB32_3
 ; RV64-NEXT:  # %bb.1: # %t
 ; RV64-NEXT:    mv s1, a2
-; RV64-NEXT:    mv s2, a1
 ; RV64-NEXT:    mv s3, a0
-; RV64-NEXT:    sltu s4, a0, a1
+; RV64-NEXT:    sltu s4, a0, s2
 ; RV64-NEXT:    mv a0, s4
 ; RV64-NEXT:    call call
 ; RV64-NEXT:    bgeu s3, s2, .LBB32_3
@@ -1275,8 +1273,8 @@ define void @PR41129(ptr %p64) {
 ; RV32-NEXT:    ret
 ; RV32-NEXT:  .LBB37_2: # %true
 ; RV32-NEXT:    seqz a3, a1
-; RV32-NEXT:    sub a2, a2, a3
 ; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    sub a2, a2, a3
 ; RV32-NEXT:    sw a1, 0(a0)
 ; RV32-NEXT:    sw a2, 4(a0)
 ; RV32-NEXT:    ret
@@ -1316,9 +1314,9 @@ define i16 @overflow_not_used(i16 %a, i16 %b, ptr %res) {
 ; RV32-LABEL: overflow_not_used:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a4, a1, a3
-; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    and a3, a0, a3
 ; RV32-NEXT:    bltu a3, a4, .LBB38_2
 ; RV32-NEXT:  # %bb.1:
@@ -1331,9 +1329,9 @@ define i16 @overflow_not_used(i16 %a, i16 %b, ptr %res) {
 ; RV64-LABEL: overflow_not_used:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a3, 16
+; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    addiw a3, a3, -1
 ; RV64-NEXT:    and a4, a1, a3
-; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    and a3, a0, a3
 ; RV64-NEXT:    bltu a3, a4, .LBB38_2
 ; RV64-NEXT:  # %bb.1:

diff  --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll
index 8aa145f6ac5efa..8e858bdd29762f 100644
--- a/llvm/test/CodeGen/RISCV/pr51206.ll
+++ b/llvm/test/CodeGen/RISCV/pr51206.ll
@@ -12,21 +12,21 @@ define signext i32 @wobble() nounwind {
 ; CHECK-LABEL: wobble:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    lui a0, %hi(global)
+; CHECK-NEXT:    lui a1, %hi(global.1)
 ; CHECK-NEXT:    lbu a0, %lo(global)(a0)
-; CHECK-NEXT:    lui a1, %hi(global.2)
-; CHECK-NEXT:    lbu a1, %lo(global.2)(a1)
+; CHECK-NEXT:    lui a2, %hi(global.2)
+; CHECK-NEXT:    lui a3, 52429
+; CHECK-NEXT:    lbu a2, %lo(global.2)(a2)
 ; CHECK-NEXT:    addi a0, a0, 1
-; CHECK-NEXT:    lui a2, %hi(global.1)
-; CHECK-NEXT:    sw a0, %lo(global.1)(a2)
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    slli a1, a0, 48
-; CHECK-NEXT:    lui a2, 52429
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    mulhu a1, a1, a2
-; CHECK-NEXT:    srli a1, a1, 18
-; CHECK-NEXT:    lui a2, %hi(global.3)
+; CHECK-NEXT:    sw a0, %lo(global.1)(a1)
+; CHECK-NEXT:    lui a1, %hi(global.3)
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a2, a0, 48
+; CHECK-NEXT:    mulhu a2, a2, a3
+; CHECK-NEXT:    srli a2, a2, 18
 ; CHECK-NEXT:    li a3, 5
-; CHECK-NEXT:    sw a1, %lo(global.3)(a2)
+; CHECK-NEXT:    sw a2, %lo(global.3)(a1)
 ; CHECK-NEXT:    bgeu a0, a3, .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %bb12
 ; CHECK-NEXT:    li a0, 0

diff  --git a/llvm/test/CodeGen/RISCV/pr56457.ll b/llvm/test/CodeGen/RISCV/pr56457.ll
index ba08aa838bf992..cf518b31a190b7 100644
--- a/llvm/test/CodeGen/RISCV/pr56457.ll
+++ b/llvm/test/CodeGen/RISCV/pr56457.ll
@@ -10,41 +10,41 @@ define i15 @foo(i15 %x) nounwind {
 ; CHECK-NEXT:    beqz a1, .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %cond.false
 ; CHECK-NEXT:    srli a1, a1, 50
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    lui a3, 209715
+; CHECK-NEXT:    lui a4, 61681
 ; CHECK-NEXT:    or a0, a0, a1
-; CHECK-NEXT:    slli a1, a0, 49
-; CHECK-NEXT:    srli a1, a1, 51
-; CHECK-NEXT:    or a0, a0, a1
-; CHECK-NEXT:    slli a1, a0, 49
-; CHECK-NEXT:    srli a1, a1, 53
-; CHECK-NEXT:    or a0, a0, a1
-; CHECK-NEXT:    slli a1, a0, 49
-; CHECK-NEXT:    srli a1, a1, 57
-; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    addiw a1, a2, 1365
+; CHECK-NEXT:    addiw a2, a3, 819
+; CHECK-NEXT:    addiw a3, a4, -241
+; CHECK-NEXT:    slli a4, a2, 32
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    slli a4, a3, 32
+; CHECK-NEXT:    add a3, a3, a4
+; CHECK-NEXT:    slli a4, a0, 49
+; CHECK-NEXT:    srli a4, a4, 51
+; CHECK-NEXT:    or a0, a0, a4
+; CHECK-NEXT:    slli a4, a0, 49
+; CHECK-NEXT:    srli a4, a4, 53
+; CHECK-NEXT:    or a0, a0, a4
+; CHECK-NEXT:    slli a4, a0, 49
+; CHECK-NEXT:    srli a4, a4, 57
+; CHECK-NEXT:    or a0, a0, a4
 ; CHECK-NEXT:    not a0, a0
-; CHECK-NEXT:    srli a1, a0, 1
-; CHECK-NEXT:    lui a2, 1
-; CHECK-NEXT:    addiw a2, a2, 1365
-; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    srli a4, a0, 1
+; CHECK-NEXT:    and a1, a4, a1
 ; CHECK-NEXT:    slli a0, a0, 49
 ; CHECK-NEXT:    srli a0, a0, 49
 ; CHECK-NEXT:    sub a0, a0, a1
-; CHECK-NEXT:    lui a1, 209715
-; CHECK-NEXT:    addiw a1, a1, 819
-; CHECK-NEXT:    slli a2, a1, 32
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    and a2, a0, a1
+; CHECK-NEXT:    and a1, a0, a2
 ; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    srli a1, a0, 4
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addiw a1, a1, -241
-; CHECK-NEXT:    slli a2, a1, 32
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    lui a1, 4112
 ; CHECK-NEXT:    addiw a1, a1, 257
+; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    slli a2, a1, 32
 ; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    mul a0, a0, a1

diff  --git a/llvm/test/CodeGen/RISCV/pr58511.ll b/llvm/test/CodeGen/RISCV/pr58511.ll
index df02d77f613290..e5cba679729fae 100644
--- a/llvm/test/CodeGen/RISCV/pr58511.ll
+++ b/llvm/test/CodeGen/RISCV/pr58511.ll
@@ -5,8 +5,8 @@ define i32 @f(i1 %0, i32 %1, ptr %2) {
 ; CHECK-LABEL: f:
 ; CHECK:       # %bb.0: # %BB
 ; CHECK-NEXT:    slli a0, a0, 63
-; CHECK-NEXT:    srai a0, a0, 63
 ; CHECK-NEXT:    lui a3, 4097
+; CHECK-NEXT:    srai a0, a0, 63
 ; CHECK-NEXT:    addiw a3, a3, -2047
 ; CHECK-NEXT:    or a0, a0, a3
 ; CHECK-NEXT:    mul a1, a1, a3
@@ -24,8 +24,8 @@ define i32 @g(i1 %0, i32 %1, ptr %2) {
 ; CHECK-LABEL: g:
 ; CHECK:       # %bb.0: # %BB
 ; CHECK-NEXT:    andi a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    lui a3, 4097
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    addiw a3, a3, -2047
 ; CHECK-NEXT:    or a0, a0, a3
 ; CHECK-NEXT:    mul a1, a1, a3
@@ -43,10 +43,10 @@ define i32 @h(i1 %0, i32 %1, ptr %2) {
 ; CHECK-LABEL: h:
 ; CHECK:       # %bb.0: # %BB
 ; CHECK-NEXT:    lui a3, 4097
-; CHECK-NEXT:    addiw a3, a3, -2047
-; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    slli a0, a0, 63
+; CHECK-NEXT:    addiw a3, a3, -2047
 ; CHECK-NEXT:    srai a0, a0, 63
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    sw a1, 0(a2)
 ; CHECK-NEXT:    ret
@@ -64,8 +64,8 @@ define i32 @i(i1 %0, i32 %1, ptr %2) {
 ; CHECK-NEXT:    andi a0, a0, 1
 ; CHECK-NEXT:    lui a3, 4097
 ; CHECK-NEXT:    addiw a3, a3, -2047
-; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    and a0, a0, a3
 ; CHECK-NEXT:    sw a1, 0(a2)
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/pr65025.ll b/llvm/test/CodeGen/RISCV/pr65025.ll
index dcd71edc460b8d..c6770b05da555e 100644
--- a/llvm/test/CodeGen/RISCV/pr65025.ll
+++ b/llvm/test/CodeGen/RISCV/pr65025.ll
@@ -7,10 +7,10 @@ define ptr @cmpxchg_masked_and_branch1(ptr %ptr, i8 signext %cmp, i8 signext %va
 ; CHECK-NEXT:    andi a3, a0, -4
 ; CHECK-NEXT:    slli a4, a0, 3
 ; CHECK-NEXT:    li a5, 255
-; CHECK-NEXT:    sllw a5, a5, a4
 ; CHECK-NEXT:    andi a1, a1, 255
-; CHECK-NEXT:    sllw a1, a1, a4
 ; CHECK-NEXT:    andi a2, a2, 255
+; CHECK-NEXT:    sllw a5, a5, a4
+; CHECK-NEXT:    sllw a1, a1, a4
 ; CHECK-NEXT:    sllw a2, a2, a4
 ; CHECK-NEXT:  .LBB0_3: # %do_cmpxchg
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1

diff  --git a/llvm/test/CodeGen/RISCV/pr68855.ll b/llvm/test/CodeGen/RISCV/pr68855.ll
index e9d1f6c2d1b2cc..8031bf4f30411b 100644
--- a/llvm/test/CodeGen/RISCV/pr68855.ll
+++ b/llvm/test/CodeGen/RISCV/pr68855.ll
@@ -6,10 +6,10 @@ define i16 @narrow_load(ptr %p1, ptr %p2) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lhu a2, 0(a0)
 ; CHECK-NEXT:    lui a3, 2
-; CHECK-NEXT:    addiw a3, a3, -1
-; CHECK-NEXT:    xor a2, a2, a3
 ; CHECK-NEXT:    lui a4, 16
+; CHECK-NEXT:    addiw a3, a3, -1
 ; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    xor a2, a2, a3
 ; CHECK-NEXT:    xor a4, a3, a4
 ; CHECK-NEXT:    or a2, a2, a4
 ; CHECK-NEXT:    sw a2, 0(a1)

diff  --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index 69746e3e70bfcf..9fc9a3c42867e7 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -7,21 +7,21 @@
 define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-LABEL: test:
 ; NOREMAT:       # %bb.0:
-; NOREMAT-NEXT:    addi sp, sp, -400
-; NOREMAT-NEXT:    .cfi_def_cfa_offset 400
-; NOREMAT-NEXT:    sd ra, 392(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s0, 384(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s1, 376(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s2, 368(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s3, 360(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s4, 352(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s5, 344(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s6, 336(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s7, 328(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s8, 320(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s9, 312(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s10, 304(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    sd s11, 296(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addi sp, sp, -752
+; NOREMAT-NEXT:    .cfi_def_cfa_offset 752
+; NOREMAT-NEXT:    sd ra, 744(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s0, 736(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s1, 728(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s2, 720(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s3, 712(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s4, 704(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s5, 696(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s6, 688(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s7, 680(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s8, 672(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s9, 664(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s10, 656(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sd s11, 648(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    .cfi_offset ra, -8
 ; NOREMAT-NEXT:    .cfi_offset s0, -16
 ; NOREMAT-NEXT:    .cfi_offset s1, -24
@@ -36,747 +36,845 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    .cfi_offset s10, -96
 ; NOREMAT-NEXT:    .cfi_offset s11, -104
 ; NOREMAT-NEXT:    csrr a2, vlenb
-; NOREMAT-NEXT:    li a3, 6
-; NOREMAT-NEXT:    mul a2, a2, a3
+; NOREMAT-NEXT:    slli a2, a2, 1
 ; NOREMAT-NEXT:    sub sp, sp, a2
-; NOREMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x03, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 400 + 6 * vlenb
-; NOREMAT-NEXT:    li a2, 32
-; NOREMAT-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
-; NOREMAT-NEXT:    vle32.v v8, (a0)
-; NOREMAT-NEXT:    addi a2, a0, 512
-; NOREMAT-NEXT:    vle32.v v10, (a2)
-; NOREMAT-NEXT:    addi a2, a0, 1024
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    addi a2, a0, 1536
-; NOREMAT-NEXT:    vle32.v v14, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a2)
-; NOREMAT-NEXT:    li a2, 1
-; NOREMAT-NEXT:    slli a2, a2, 11
-; NOREMAT-NEXT:    sd a2, 272(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    li a5, 5
-; NOREMAT-NEXT:    slli a2, a5, 9
-; NOREMAT-NEXT:    sd a2, 264(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v14, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a2)
-; NOREMAT-NEXT:    li a2, 3
-; NOREMAT-NEXT:    slli a3, a2, 10
-; NOREMAT-NEXT:    sd a3, 256(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v12, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    li a4, 7
-; NOREMAT-NEXT:    slli a3, a4, 9
-; NOREMAT-NEXT:    sd a3, 248(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v14, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a3)
-; NOREMAT-NEXT:    lui a3, 1
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v12, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    li a3, 9
-; NOREMAT-NEXT:    slli a6, a3, 9
-; NOREMAT-NEXT:    sd a6, 240(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v14, (a6)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a6)
-; NOREMAT-NEXT:    slli a6, a5, 10
-; NOREMAT-NEXT:    sd a6, 232(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v12, (a6)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a6)
-; NOREMAT-NEXT:    li s8, 11
-; NOREMAT-NEXT:    slli a6, s8, 9
-; NOREMAT-NEXT:    sd a6, 224(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v14, (a6)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a6)
-; NOREMAT-NEXT:    slli a2, a2, 11
-; NOREMAT-NEXT:    sd a2, 216(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    li s2, 13
-; NOREMAT-NEXT:    slli a2, s2, 9
-; NOREMAT-NEXT:    sd a2, 208(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v14, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v10, (a2)
-; NOREMAT-NEXT:    slli a2, a4, 10
-; NOREMAT-NEXT:    sd a2, 200(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a0, a2
-; NOREMAT-NEXT:    vle32.v v12, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; NOREMAT-NEXT:    vle32.v v8, (a2)
-; NOREMAT-NEXT:    li a2, 15
-; NOREMAT-NEXT:    slli a6, a2, 9
-; NOREMAT-NEXT:    sd a6, 192(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v26, (a6)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; NOREMAT-NEXT:    vle32.v v16, (a6)
-; NOREMAT-NEXT:    lui a6, 2
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v28, (a6)
-; NOREMAT-NEXT:    vle32.v v10, (a6)
-; NOREMAT-NEXT:    li a6, 17
-; NOREMAT-NEXT:    slli a6, a6, 9
-; NOREMAT-NEXT:    sd a6, 184(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    li t0, 17
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v30, (a6)
-; NOREMAT-NEXT:    vle32.v v18, (a6)
-; NOREMAT-NEXT:    slli a6, a3, 10
-; NOREMAT-NEXT:    sd a6, 176(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v6, (a6)
-; NOREMAT-NEXT:    vle32.v v20, (a6)
-; NOREMAT-NEXT:    li a6, 19
-; NOREMAT-NEXT:    slli a6, a6, 9
-; NOREMAT-NEXT:    sd a6, 168(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    li a7, 19
-; NOREMAT-NEXT:    add a6, a0, a6
-; NOREMAT-NEXT:    vle32.v v4, (a6)
-; NOREMAT-NEXT:    vle32.v v22, (a6)
-; NOREMAT-NEXT:    slli a5, a5, 11
-; NOREMAT-NEXT:    sd a5, 160(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v2, (a5)
-; NOREMAT-NEXT:    vle32.v v12, (a5)
-; NOREMAT-NEXT:    li s10, 21
-; NOREMAT-NEXT:    slli a5, s10, 9
-; NOREMAT-NEXT:    sd a5, 152(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v24, (a5)
-; NOREMAT-NEXT:    vle32.v v14, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
-; NOREMAT-NEXT:    slli a5, s8, 10
-; NOREMAT-NEXT:    sd a5, 144(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v26, (a5)
+; NOREMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
+; NOREMAT-NEXT:    mv a7, a0
+; NOREMAT-NEXT:    li a0, 32
+; NOREMAT-NEXT:    addi a5, a7, 512
+; NOREMAT-NEXT:    addi a4, a7, 1024
+; NOREMAT-NEXT:    addi a6, a7, 1536
+; NOREMAT-NEXT:    li t4, 1
+; NOREMAT-NEXT:    li a2, 5
+; NOREMAT-NEXT:    li t1, 3
+; NOREMAT-NEXT:    li t0, 7
+; NOREMAT-NEXT:    lui t5, 1
+; NOREMAT-NEXT:    li s4, 9
+; NOREMAT-NEXT:    li s6, 11
+; NOREMAT-NEXT:    li s9, 13
+; NOREMAT-NEXT:    li ra, 15
+; NOREMAT-NEXT:    lui t2, 2
+; NOREMAT-NEXT:    lui s1, 3
+; NOREMAT-NEXT:    lui t3, 4
+; NOREMAT-NEXT:    lui s0, 5
+; NOREMAT-NEXT:    lui s3, 6
+; NOREMAT-NEXT:    lui s7, 7
+; NOREMAT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; NOREMAT-NEXT:    slli t4, t4, 11
+; NOREMAT-NEXT:    sd t4, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli a3, a2, 9
+; NOREMAT-NEXT:    sd a3, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli t6, t1, 10
+; NOREMAT-NEXT:    slli s2, t0, 9
+; NOREMAT-NEXT:    add a0, a7, t5
+; NOREMAT-NEXT:    lui s11, 1
+; NOREMAT-NEXT:    slli s4, s4, 9
+; NOREMAT-NEXT:    slli s5, a2, 10
+; NOREMAT-NEXT:    slli s6, s6, 9
+; NOREMAT-NEXT:    slli s8, t1, 11
 ; NOREMAT-NEXT:    vle32.v v8, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v28
-; NOREMAT-NEXT:    li s6, 23
-; NOREMAT-NEXT:    slli a5, s6, 9
-; NOREMAT-NEXT:    sd a5, 136(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v28, (a5)
-; NOREMAT-NEXT:    vle32.v v16, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v30
-; NOREMAT-NEXT:    lui a5, 3
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v30, (a5)
-; NOREMAT-NEXT:    vle32.v v10, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v6
-; NOREMAT-NEXT:    li s3, 25
-; NOREMAT-NEXT:    slli a5, s3, 9
-; NOREMAT-NEXT:    sd a5, 128(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v6, (a5)
-; NOREMAT-NEXT:    vle32.v v18, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v4
-; NOREMAT-NEXT:    slli a5, s2, 10
-; NOREMAT-NEXT:    sd a5, 120(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v4, (a5)
-; NOREMAT-NEXT:    vle32.v v20, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v2
-; NOREMAT-NEXT:    li t5, 27
-; NOREMAT-NEXT:    slli a5, t5, 9
-; NOREMAT-NEXT:    sd a5, 112(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a5, a0, a5
-; NOREMAT-NEXT:    vle32.v v2, (a5)
-; NOREMAT-NEXT:    vle32.v v22, (a5)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v24
-; NOREMAT-NEXT:    slli a4, a4, 11
-; NOREMAT-NEXT:    sd a4, 104(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v24, (a4)
-; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v26
-; NOREMAT-NEXT:    li t2, 29
-; NOREMAT-NEXT:    slli a4, t2, 9
-; NOREMAT-NEXT:    sd a4, 96(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
+; NOREMAT-NEXT:    slli s9, s9, 9
+; NOREMAT-NEXT:    li t5, 13
+; NOREMAT-NEXT:    vle32.v v10, (a4)
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    slli s10, t0, 10
+; NOREMAT-NEXT:    vle32.v v0, (a6)
+; NOREMAT-NEXT:    vle32.v v12, (a6)
+; NOREMAT-NEXT:    slli ra, ra, 9
+; NOREMAT-NEXT:    vle32.v v4, (a0)
+; NOREMAT-NEXT:    vle32.v v20, (a0)
+; NOREMAT-NEXT:    add a4, a7, t2
+; NOREMAT-NEXT:    vle32.v v6, (a4)
+; NOREMAT-NEXT:    vle32.v v30, (a4)
+; NOREMAT-NEXT:    add a4, a7, s1
+; NOREMAT-NEXT:    vle32.v v28, (a4)
 ; NOREMAT-NEXT:    vle32.v v26, (a4)
+; NOREMAT-NEXT:    add a4, a7, t3
+; NOREMAT-NEXT:    vle32.v v24, (a4)
+; NOREMAT-NEXT:    vle32.v v22, (a4)
+; NOREMAT-NEXT:    add a4, a7, s0
+; NOREMAT-NEXT:    vle32.v v14, (a7)
+; NOREMAT-NEXT:    vle32.v v18, (a4)
+; NOREMAT-NEXT:    vle32.v v16, (a4)
+; NOREMAT-NEXT:    add a4, a7, s3
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v8
 ; NOREMAT-NEXT:    vle32.v v14, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v28
-; NOREMAT-NEXT:    slli a4, a2, 10
-; NOREMAT-NEXT:    sd a4, 88(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v28, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    csrr a4, vlenb
-; NOREMAT-NEXT:    slli a4, a4, 2
-; NOREMAT-NEXT:    add a4, sp, a4
-; NOREMAT-NEXT:    addi a4, a4, 288
-; NOREMAT-NEXT:    vs2r.v v8, (a4) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v30
-; NOREMAT-NEXT:    li a5, 31
-; NOREMAT-NEXT:    slli a4, a5, 9
-; NOREMAT-NEXT:    sd a4, 80(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v6
-; NOREMAT-NEXT:    lui a6, 4
-; NOREMAT-NEXT:    add a4, a0, a6
-; NOREMAT-NEXT:    vle32.v v6, (a4)
+; NOREMAT-NEXT:    addi a0, sp, 640
+; NOREMAT-NEXT:    vs2r.v v8, (a0) # Unknown-size Folded Spill
+; NOREMAT-NEXT:    add a4, a7, t4
+; NOREMAT-NEXT:    vle32.v v10, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    add a4, a7, a3
+; NOREMAT-NEXT:    vle32.v v0, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v10
+; NOREMAT-NEXT:    vle32.v v10, (a4)
+; NOREMAT-NEXT:    add a4, a7, t6
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    add a4, a7, s2
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    add a4, a7, s7
+; NOREMAT-NEXT:    vle32.v v0, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v8
+; NOREMAT-NEXT:    vle32.v v10, (a4)
+; NOREMAT-NEXT:    add a4, a7, s4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    csrr a4, vlenb
-; NOREMAT-NEXT:    slli a4, a4, 1
-; NOREMAT-NEXT:    add a4, sp, a4
-; NOREMAT-NEXT:    addi a4, a4, 288
-; NOREMAT-NEXT:    vs2r.v v8, (a4) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v4
-; NOREMAT-NEXT:    addiw a4, a6, 512
-; NOREMAT-NEXT:    sd a4, 72(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    add a4, a7, s5
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
-; NOREMAT-NEXT:    vle32.v v18, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v2
-; NOREMAT-NEXT:    slli a4, t0, 10
-; NOREMAT-NEXT:    sd a4, 64(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v8
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    add a4, a7, s6
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
-; NOREMAT-NEXT:    addiw a4, a6, 1536
-; NOREMAT-NEXT:    sd a4, 56(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a0, a4
-; NOREMAT-NEXT:    vle32.v v0, (a4)
-; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v26
-; NOREMAT-NEXT:    slli a3, a3, 11
-; NOREMAT-NEXT:    sd a3, 48(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    add a4, a7, s8
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    add a4, a7, s9
+; NOREMAT-NEXT:    vle32.v v20, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    add a4, a7, s10
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    add a4, a7, ra
+; NOREMAT-NEXT:    vle32.v v2, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; NOREMAT-NEXT:    lui t4, 8
+; NOREMAT-NEXT:    add a5, a7, t4
+; NOREMAT-NEXT:    vle32.v v20, (a5)
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v2
+; NOREMAT-NEXT:    li a4, 17
+; NOREMAT-NEXT:    slli a4, a4, 9
+; NOREMAT-NEXT:    li s1, 17
+; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    vle32.v v4, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v6
+; NOREMAT-NEXT:    li a5, 9
+; NOREMAT-NEXT:    slli a4, a5, 10
+; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    vle32.v v12, (a4)
+; NOREMAT-NEXT:    vle32.v v6, (a4)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
+; NOREMAT-NEXT:    li a4, 19
+; NOREMAT-NEXT:    slli a4, a4, 9
+; NOREMAT-NEXT:    li t2, 19
+; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    vle32.v v8, (a4)
+; NOREMAT-NEXT:    vle32.v v30, (a4)
+; NOREMAT-NEXT:    slli a3, a2, 11
+; NOREMAT-NEXT:    sd a3, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
+; NOREMAT-NEXT:    vle32.v v4, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
+; NOREMAT-NEXT:    li s7, 21
+; NOREMAT-NEXT:    slli a3, s7, 9
+; NOREMAT-NEXT:    sd a3, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    addi a3, sp, 288
-; NOREMAT-NEXT:    vs2r.v v8, (a3) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v28
-; NOREMAT-NEXT:    lui s1, 5
-; NOREMAT-NEXT:    addiw a3, s1, -1536
-; NOREMAT-NEXT:    sd a3, 40(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    vle32.v v24, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 2
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v30
-; NOREMAT-NEXT:    slli a3, a7, 10
-; NOREMAT-NEXT:    sd a3, 32(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
-; NOREMAT-NEXT:    vle32.v v10, (a3)
-; NOREMAT-NEXT:    vle32.v v14, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v6
-; NOREMAT-NEXT:    addiw a3, s1, -512
-; NOREMAT-NEXT:    sd a3, 24(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
-; NOREMAT-NEXT:    vle32.v v16, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 1
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vl2r.v v26, (a3) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v4
-; NOREMAT-NEXT:    add a3, a0, s1
-; NOREMAT-NEXT:    vle32.v v26, (a3)
-; NOREMAT-NEXT:    vle32.v v28, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 2
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vs2r.v v28, (a3) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
-; NOREMAT-NEXT:    addiw ra, s1, 512
-; NOREMAT-NEXT:    add a3, a0, ra
-; NOREMAT-NEXT:    vle32.v v28, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
+; NOREMAT-NEXT:    li a6, 11
+; NOREMAT-NEXT:    slli a3, a6, 10
+; NOREMAT-NEXT:    sd a3, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
-; NOREMAT-NEXT:    slli s11, s10, 10
-; NOREMAT-NEXT:    add a3, a0, s11
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
+; NOREMAT-NEXT:    li s3, 23
+; NOREMAT-NEXT:    slli a3, s3, 9
+; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
-; NOREMAT-NEXT:    vle32.v v18, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v12
-; NOREMAT-NEXT:    addiw s10, s1, 1536
-; NOREMAT-NEXT:    add a3, a0, s10
-; NOREMAT-NEXT:    vle32.v v2, (a3)
-; NOREMAT-NEXT:    vle32.v v20, (a3)
-; NOREMAT-NEXT:    addi a3, sp, 288
-; NOREMAT-NEXT:    vl2r.v v12, (a3) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v8
-; NOREMAT-NEXT:    slli s9, s8, 11
-; NOREMAT-NEXT:    add a3, a0, s9
-; NOREMAT-NEXT:    vle32.v v0, (a3)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
+; NOREMAT-NEXT:    li s0, 25
+; NOREMAT-NEXT:    slli a3, s0, 9
+; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v10
-; NOREMAT-NEXT:    lui t0, 6
-; NOREMAT-NEXT:    addiw s8, t0, -1536
-; NOREMAT-NEXT:    add a3, a0, s8
-; NOREMAT-NEXT:    vle32.v v8, (a3)
-; NOREMAT-NEXT:    vle32.v v22, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
-; NOREMAT-NEXT:    slli s7, s6, 10
-; NOREMAT-NEXT:    add a3, a0, s7
-; NOREMAT-NEXT:    vle32.v v10, (a3)
-; NOREMAT-NEXT:    vle32.v v14, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v26
-; NOREMAT-NEXT:    addiw s6, t0, -512
-; NOREMAT-NEXT:    add a3, a0, s6
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
-; NOREMAT-NEXT:    vle32.v v16, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 2
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vl2r.v v24, (a3) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v28
-; NOREMAT-NEXT:    add a3, a0, t0
-; NOREMAT-NEXT:    vle32.v v24, (a3)
-; NOREMAT-NEXT:    vle32.v v26, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 2
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vs2r.v v26, (a3) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v4
-; NOREMAT-NEXT:    addiw s5, t0, 512
-; NOREMAT-NEXT:    add a3, a0, s5
-; NOREMAT-NEXT:    vle32.v v26, (a3)
-; NOREMAT-NEXT:    vle32.v v28, (a3)
-; NOREMAT-NEXT:    csrr a3, vlenb
-; NOREMAT-NEXT:    slli a3, a3, 1
-; NOREMAT-NEXT:    add a3, sp, a3
-; NOREMAT-NEXT:    addi a3, a3, 288
-; NOREMAT-NEXT:    vs2r.v v28, (a3) # Unknown-size Folded Spill
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
-; NOREMAT-NEXT:    slli s4, s3, 10
-; NOREMAT-NEXT:    add a3, a0, s4
-; NOREMAT-NEXT:    vle32.v v28, (a3)
-; NOREMAT-NEXT:    vle32.v v18, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
-; NOREMAT-NEXT:    addiw s3, t0, 1536
-; NOREMAT-NEXT:    add a3, a0, s3
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
+; NOREMAT-NEXT:    slli a3, t5, 10
+; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
-; NOREMAT-NEXT:    vle32.v v20, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v8
-; NOREMAT-NEXT:    slli s2, s2, 11
-; NOREMAT-NEXT:    add a3, a0, s2
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v28
+; NOREMAT-NEXT:    li t3, 27
+; NOREMAT-NEXT:    slli a3, t3, 9
+; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    vle32.v v28, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
-; NOREMAT-NEXT:    vle32.v v12, (a3)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v10
-; NOREMAT-NEXT:    lui a3, 7
-; NOREMAT-NEXT:    addiw s0, a3, -1536
-; NOREMAT-NEXT:    add a4, a0, s0
-; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
-; NOREMAT-NEXT:    slli t6, t5, 10
-; NOREMAT-NEXT:    add a4, a0, t6
-; NOREMAT-NEXT:    vle32.v v0, (a4)
-; NOREMAT-NEXT:    vle32.v v14, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v24
-; NOREMAT-NEXT:    addiw t5, a3, -512
-; NOREMAT-NEXT:    add a4, a0, t5
-; NOREMAT-NEXT:    vle32.v v6, (a4)
-; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    csrr a4, vlenb
-; NOREMAT-NEXT:    slli a4, a4, 2
-; NOREMAT-NEXT:    add a4, sp, a4
-; NOREMAT-NEXT:    addi a4, a4, 288
-; NOREMAT-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
-; NOREMAT-NEXT:    add a4, a0, a3
-; NOREMAT-NEXT:    vle32.v v26, (a4)
-; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    csrr a4, vlenb
-; NOREMAT-NEXT:    slli a4, a4, 1
-; NOREMAT-NEXT:    add a4, sp, a4
-; NOREMAT-NEXT:    addi a4, a4, 288
-; NOREMAT-NEXT:    vl2r.v v10, (a4) # Unknown-size Folded Reload
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
-; NOREMAT-NEXT:    addiw t4, a3, 512
-; NOREMAT-NEXT:    add a4, a0, t4
-; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    vle32.v v24, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v18, v30
-; NOREMAT-NEXT:    slli t3, t2, 10
-; NOREMAT-NEXT:    add a4, a0, t3
-; NOREMAT-NEXT:    vle32.v v18, (a4)
-; NOREMAT-NEXT:    vle32.v v28, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v4
-; NOREMAT-NEXT:    addiw t2, a3, 1536
-; NOREMAT-NEXT:    add a4, a0, t2
-; NOREMAT-NEXT:    vle32.v v20, (a4)
-; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
-; NOREMAT-NEXT:    slli t1, a2, 11
-; NOREMAT-NEXT:    add a2, a0, t1
+; NOREMAT-NEXT:    slli a2, t0, 11
+; NOREMAT-NEXT:    sd a2, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
+; NOREMAT-NEXT:    li t0, 29
+; NOREMAT-NEXT:    slli a2, t0, 9
+; NOREMAT-NEXT:    sd a2, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v28
+; NOREMAT-NEXT:    li a3, 15
+; NOREMAT-NEXT:    slli a2, a3, 10
+; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
+; NOREMAT-NEXT:    li t1, 31
+; NOREMAT-NEXT:    slli a2, t1, 9
+; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v0
-; NOREMAT-NEXT:    lui a2, 8
-; NOREMAT-NEXT:    addiw a7, a2, -1536
-; NOREMAT-NEXT:    add a4, a0, a7
-; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
-; NOREMAT-NEXT:    slli a6, a5, 10
-; NOREMAT-NEXT:    add a4, a0, a6
-; NOREMAT-NEXT:    vle32.v v14, (a4)
-; NOREMAT-NEXT:    vle32.v v6, (a4)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v26
-; NOREMAT-NEXT:    addiw a5, a2, -512
-; NOREMAT-NEXT:    add a4, a0, a5
-; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    vle32.v v26, (a4)
-; NOREMAT-NEXT:    add a0, a0, a2
-; NOREMAT-NEXT:    vle32.v v0, (a0)
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v18
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v20
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
+; NOREMAT-NEXT:    lui a4, 4
+; NOREMAT-NEXT:    addiw a0, a4, 512
+; NOREMAT-NEXT:    sd a0, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a7, a0
+; NOREMAT-NEXT:    vle32.v v8, (a0)
+; NOREMAT-NEXT:    vle32.v v26, (a0)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v28
+; NOREMAT-NEXT:    slli a2, s1, 10
+; NOREMAT-NEXT:    sd a2, 488(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
+; NOREMAT-NEXT:    addiw a2, a4, 1536
+; NOREMAT-NEXT:    sd a2, 480(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    slli a2, a5, 11
+; NOREMAT-NEXT:    sd a2, 472(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v24
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v8
+; NOREMAT-NEXT:    lui a5, 5
+; NOREMAT-NEXT:    addiw a2, a5, -1536
+; NOREMAT-NEXT:    sd a2, 464(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v28
+; NOREMAT-NEXT:    slli a2, t2, 10
+; NOREMAT-NEXT:    sd a2, 456(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    li t2, 19
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
+; NOREMAT-NEXT:    addiw a2, a5, -512
+; NOREMAT-NEXT:    sd a2, 448(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v24
+; NOREMAT-NEXT:    addiw a2, a5, 512
+; NOREMAT-NEXT:    sd a2, 440(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    slli a2, s7, 10
+; NOREMAT-NEXT:    sd a2, 432(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v26
+; NOREMAT-NEXT:    addiw a2, a5, 1536
+; NOREMAT-NEXT:    sd a2, 424(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    slli a2, a6, 11
+; NOREMAT-NEXT:    sd a2, 416(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v12
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v18
+; NOREMAT-NEXT:    lui a6, 6
+; NOREMAT-NEXT:    addiw a2, a6, -1536
+; NOREMAT-NEXT:    sd a2, 408(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v18, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    slli a2, s3, 10
+; NOREMAT-NEXT:    sd a2, 400(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v24
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v16, (a2)
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
+; NOREMAT-NEXT:    addiw a2, a6, -512
+; NOREMAT-NEXT:    sd a2, 392(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v22
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v14
+; NOREMAT-NEXT:    addiw a2, a6, 512
+; NOREMAT-NEXT:    sd a2, 384(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    slli a2, s0, 10
+; NOREMAT-NEXT:    sd a2, 376(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v2, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v18
+; NOREMAT-NEXT:    addiw a2, a6, 1536
+; NOREMAT-NEXT:    sd a2, 368(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v18, (a2)
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    slli a2, t5, 11
+; NOREMAT-NEXT:    sd a2, 360(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v16
-; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v0
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v16, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v8
+; NOREMAT-NEXT:    lui s0, 7
+; NOREMAT-NEXT:    addiw a2, s0, -1536
+; NOREMAT-NEXT:    sd a2, 352(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    slli a2, t3, 10
+; NOREMAT-NEXT:    sd a2, 344(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v14
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v14, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    addi a0, sp, 640
+; NOREMAT-NEXT:    vl2r.v v12, (a0) # Unknown-size Folded Reload
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v22
+; NOREMAT-NEXT:    addiw a2, s0, -512
+; NOREMAT-NEXT:    sd a2, 336(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v12, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v26
+; NOREMAT-NEXT:    addiw a2, s0, 512
+; NOREMAT-NEXT:    sd a2, 328(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui t3, 7
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v26, (a2)
+; NOREMAT-NEXT:    vle32.v v4, (a2)
+; NOREMAT-NEXT:    slli a2, t0, 10
+; NOREMAT-NEXT:    sd a2, 320(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v18
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v18, (a2)
+; NOREMAT-NEXT:    vle32.v v2, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v16
+; NOREMAT-NEXT:    addiw a2, t3, 1536
+; NOREMAT-NEXT:    sd a2, 312(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v16, (a2)
+; NOREMAT-NEXT:    vle32.v v28, (a2)
+; NOREMAT-NEXT:    slli a2, a3, 11
+; NOREMAT-NEXT:    sd a2, 304(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v6, (a2)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v14
+; NOREMAT-NEXT:    addiw a2, t4, -1536
+; NOREMAT-NEXT:    sd a2, 296(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v14, (a2)
+; NOREMAT-NEXT:    vle32.v v24, (a2)
+; NOREMAT-NEXT:    slli a2, t1, 10
+; NOREMAT-NEXT:    sd a2, 288(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v22
+; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    vle32.v v22, (a2)
+; NOREMAT-NEXT:    vle32.v v30, (a2)
+; NOREMAT-NEXT:    addiw a0, t4, -512
+; NOREMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a7, a0
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
+; NOREMAT-NEXT:    vle32.v v12, (a0)
+; NOREMAT-NEXT:    vle32.v v0, (a0)
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v26
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v18
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v16
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v14
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v22
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
+; NOREMAT-NEXT:    sf.vc.vv 3, 0, v0, v20
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    addi a0, a1, 1024
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    add s11, a1, s11
+; NOREMAT-NEXT:    sd s11, 272(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui a0, 2
+; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    sd a0, 264(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui a0, 3
+; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    sd a0, 256(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a1, a4
+; NOREMAT-NEXT:    sd a4, 248(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a5, a1, a5
+; NOREMAT-NEXT:    sd a5, 240(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a6, a1, a6
+; NOREMAT-NEXT:    sd a6, 232(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add t3, a1, t3
+; NOREMAT-NEXT:    sd t3, 224(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a1, t4
+; NOREMAT-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, t4, 512
+; NOREMAT-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, t4, 1024
+; NOREMAT-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, t4, 1536
+; NOREMAT-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli s1, s1, 11
+; NOREMAT-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    lui a0, 9
+; NOREMAT-NEXT:    addiw a2, a0, -1536
+; NOREMAT-NEXT:    sd a2, 88(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a2, a0, -1024
+; NOREMAT-NEXT:    sd a2, 72(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a2, a0, -512
+; NOREMAT-NEXT:    sd a2, 40(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a1, a0
+; NOREMAT-NEXT:    sd a2, 208(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw s11, a0, 512
+; NOREMAT-NEXT:    addiw s7, a0, 1024
+; NOREMAT-NEXT:    addiw s3, a0, 1536
+; NOREMAT-NEXT:    slli s1, t2, 11
+; NOREMAT-NEXT:    lui a0, 10
+; NOREMAT-NEXT:    addiw t2, a0, -1536
+; NOREMAT-NEXT:    addiw a7, a0, -1024
+; NOREMAT-NEXT:    addiw a4, a0, -512
+; NOREMAT-NEXT:    add a2, a1, a0
+; NOREMAT-NEXT:    sd a2, 200(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    addiw a0, a0, 512
+; NOREMAT-NEXT:    ld a2, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add a2, a1, a2
+; NOREMAT-NEXT:    ld a3, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add a3, a1, a3
+; NOREMAT-NEXT:    add a5, a1, t6
+; NOREMAT-NEXT:    add a6, a1, s2
+; NOREMAT-NEXT:    add t0, a1, s4
+; NOREMAT-NEXT:    add t1, a1, s5
+; NOREMAT-NEXT:    add t3, a1, s6
+; NOREMAT-NEXT:    add t4, a1, s8
+; NOREMAT-NEXT:    add t5, a1, s9
+; NOREMAT-NEXT:    add t6, a1, s10
+; NOREMAT-NEXT:    add s0, a1, ra
+; NOREMAT-NEXT:    ld s2, 624(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s2, a1, s2
+; NOREMAT-NEXT:    ld s4, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s4, a1, s4
+; NOREMAT-NEXT:    ld s5, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s5, a1, s5
+; NOREMAT-NEXT:    ld s6, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s6, a1, s6
+; NOREMAT-NEXT:    ld s8, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s8, a1, s8
+; NOREMAT-NEXT:    ld s9, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s9, a1, s9
+; NOREMAT-NEXT:    ld s10, 576(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add s10, a1, s10
+; NOREMAT-NEXT:    ld ra, 568(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 16(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 32(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 48(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 64(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 80(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 496(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 96(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 488(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 480(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 112(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 472(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 464(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 456(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 144(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 448(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 440(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 168(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 432(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 184(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 424(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 424(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 416(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 432(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 408(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 440(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 400(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 448(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 392(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 456(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 384(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 464(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 472(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 368(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 480(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 360(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 488(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 352(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 344(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 336(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 328(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 520(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 320(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 528(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 312(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 536(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 304(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 544(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 296(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 552(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 288(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 560(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 280(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 568(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 192(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 576(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 176(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 584(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 160(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 592(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 128(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 600(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 88(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 608(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 616(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    add ra, a1, ra
+; NOREMAT-NEXT:    sd ra, 624(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add ra, a1, s11
+; NOREMAT-NEXT:    add s11, a1, s7
+; NOREMAT-NEXT:    add s7, a1, s3
+; NOREMAT-NEXT:    add s3, a1, s1
+; NOREMAT-NEXT:    add s1, a1, t2
+; NOREMAT-NEXT:    add t2, a1, a7
+; NOREMAT-NEXT:    add a7, a1, a4
+; NOREMAT-NEXT:    add a4, a1, a0
 ; NOREMAT-NEXT:    addi a0, a1, 1536
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a2)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a3)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a5)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a6)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 272(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t0)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t1)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t3)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t4)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t5)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (t6)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (s0)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 264(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    vse32.v v8, (s2)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    vse32.v v8, (s4)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    lui a0, 1
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    vse32.v v8, (s5)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
+; NOREMAT-NEXT:    vse32.v v8, (s6)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 232(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    vse32.v v8, (s8)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (s9)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (s10)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    ld a0, 256(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 224(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 16(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 192(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    lui a0, 2
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 176(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 248(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 160(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    ld a0, 136(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    lui a0, 3
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 144(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 128(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 152(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 120(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 240(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 112(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 168(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 104(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 184(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 424(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 88(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 432(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 80(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 440(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    lui a0, 4
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 448(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 72(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 456(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 64(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 232(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 56(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 464(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 48(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 472(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 40(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 480(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 32(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 488(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    ld a0, 24(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    ld a0, 496(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s1, a1, s1
-; NOREMAT-NEXT:    vse32.v v8, (s1)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add ra, a1, ra
-; NOREMAT-NEXT:    vse32.v v8, (ra)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s11, a1, s11
-; NOREMAT-NEXT:    vse32.v v8, (s11)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s10, a1, s10
-; NOREMAT-NEXT:    vse32.v v8, (s10)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s9, a1, s9
-; NOREMAT-NEXT:    vse32.v v8, (s9)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s8, a1, s8
-; NOREMAT-NEXT:    vse32.v v8, (s8)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s7, a1, s7
-; NOREMAT-NEXT:    vse32.v v8, (s7)
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s6, a1, s6
-; NOREMAT-NEXT:    vse32.v v8, (s6)
+; NOREMAT-NEXT:    ld a0, 504(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t0, a1, t0
-; NOREMAT-NEXT:    vse32.v v8, (t0)
+; NOREMAT-NEXT:    ld a0, 512(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s5, a1, s5
-; NOREMAT-NEXT:    vse32.v v8, (s5)
+; NOREMAT-NEXT:    ld a0, 224(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s4, a1, s4
-; NOREMAT-NEXT:    vse32.v v8, (s4)
+; NOREMAT-NEXT:    ld a0, 520(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s3, a1, s3
-; NOREMAT-NEXT:    vse32.v v8, (s3)
+; NOREMAT-NEXT:    ld a0, 528(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s2, a1, s2
-; NOREMAT-NEXT:    vse32.v v8, (s2)
+; NOREMAT-NEXT:    ld a0, 536(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add s0, a1, s0
-; NOREMAT-NEXT:    vse32.v v8, (s0)
+; NOREMAT-NEXT:    ld a0, 544(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t6, a1, t6
-; NOREMAT-NEXT:    vse32.v v8, (t6)
+; NOREMAT-NEXT:    ld a0, 552(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t5, a1, t5
-; NOREMAT-NEXT:    vse32.v v8, (t5)
+; NOREMAT-NEXT:    ld a0, 560(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add a3, a1, a3
-; NOREMAT-NEXT:    vse32.v v8, (a3)
+; NOREMAT-NEXT:    ld a0, 568(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t4, a1, t4
-; NOREMAT-NEXT:    vse32.v v8, (t4)
+; NOREMAT-NEXT:    ld a0, 216(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t3, a1, t3
-; NOREMAT-NEXT:    vse32.v v8, (t3)
+; NOREMAT-NEXT:    ld a0, 576(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t2, a1, t2
-; NOREMAT-NEXT:    vse32.v v8, (t2)
+; NOREMAT-NEXT:    ld a0, 584(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add t1, a1, t1
-; NOREMAT-NEXT:    vse32.v v8, (t1)
+; NOREMAT-NEXT:    ld a0, 592(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add a7, a1, a7
-; NOREMAT-NEXT:    vse32.v v8, (a7)
+; NOREMAT-NEXT:    ld a0, 600(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add a6, a1, a6
-; NOREMAT-NEXT:    vse32.v v8, (a6)
+; NOREMAT-NEXT:    ld a0, 608(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    add a5, a1, a5
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a5)
-; NOREMAT-NEXT:    add a0, a1, a2
+; NOREMAT-NEXT:    ld a0, 616(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a0)
-; NOREMAT-NEXT:    addiw a0, a2, 512
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
+; NOREMAT-NEXT:    ld a0, 624(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    addiw a0, a2, 1024
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    vse32.v v10, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    addiw a0, a2, 1536
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
+; NOREMAT-NEXT:    ld a0, 208(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    li a0, 17
-; NOREMAT-NEXT:    slli a0, a0, 11
-; NOREMAT-NEXT:    add a0, a1, a0
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a0)
-; NOREMAT-NEXT:    lui a0, 9
-; NOREMAT-NEXT:    addiw a2, a0, -1536
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, -1024
-; NOREMAT-NEXT:    add a2, a1, a2
+; NOREMAT-NEXT:    vse32.v v8, (ra)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, -512
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    add a2, a1, a0
+; NOREMAT-NEXT:    vse32.v v8, (s11)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, 512
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, 1024
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    vse32.v v10, (a2)
+; NOREMAT-NEXT:    vse32.v v8, (s7)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    addiw a0, a0, 1536
-; NOREMAT-NEXT:    add a0, a1, a0
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a0)
-; NOREMAT-NEXT:    li a0, 19
-; NOREMAT-NEXT:    slli a0, a0, 11
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    vse32.v v8, (s3)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a0)
-; NOREMAT-NEXT:    lui a0, 10
-; NOREMAT-NEXT:    addiw a2, a0, -1536
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, -1024
-; NOREMAT-NEXT:    add a2, a1, a2
+; NOREMAT-NEXT:    vse32.v v8, (s1)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; NOREMAT-NEXT:    vse32.v v10, (a2)
-; NOREMAT-NEXT:    addiw a2, a0, -512
-; NOREMAT-NEXT:    add a2, a1, a2
-; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; NOREMAT-NEXT:    vse32.v v8, (a2)
-; NOREMAT-NEXT:    add a2, a1, a0
-; NOREMAT-NEXT:    vse32.v v10, (a2)
-; NOREMAT-NEXT:    addiw a0, a0, 512
-; NOREMAT-NEXT:    add a0, a1, a0
+; NOREMAT-NEXT:    vse32.v v8, (t2)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a7)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    ld a0, 200(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    vse32.v v8, (a0)
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; NOREMAT-NEXT:    vse32.v v8, (a4)
+; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; NOREMAT-NEXT:    csrr a0, vlenb
-; NOREMAT-NEXT:    li a1, 6
-; NOREMAT-NEXT:    mul a0, a0, a1
+; NOREMAT-NEXT:    slli a0, a0, 1
 ; NOREMAT-NEXT:    add sp, sp, a0
-; NOREMAT-NEXT:    .cfi_def_cfa sp, 400
-; NOREMAT-NEXT:    ld ra, 392(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s0, 384(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s1, 376(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s2, 368(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s3, 360(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s4, 352(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s5, 344(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s6, 336(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s7, 328(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s8, 320(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s9, 312(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s10, 304(sp) # 8-byte Folded Reload
-; NOREMAT-NEXT:    ld s11, 296(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    .cfi_def_cfa sp, 752
+; NOREMAT-NEXT:    ld ra, 744(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s0, 736(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s1, 728(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s2, 720(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s3, 712(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s4, 704(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s5, 696(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s6, 688(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s7, 680(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s8, 672(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s9, 664(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s10, 656(sp) # 8-byte Folded Reload
+; NOREMAT-NEXT:    ld s11, 648(sp) # 8-byte Folded Reload
 ; NOREMAT-NEXT:    .cfi_restore ra
 ; NOREMAT-NEXT:    .cfi_restore s0
 ; NOREMAT-NEXT:    .cfi_restore s1
@@ -790,27 +888,27 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    .cfi_restore s9
 ; NOREMAT-NEXT:    .cfi_restore s10
 ; NOREMAT-NEXT:    .cfi_restore s11
-; NOREMAT-NEXT:    addi sp, sp, 400
+; NOREMAT-NEXT:    addi sp, sp, 752
 ; NOREMAT-NEXT:    .cfi_def_cfa_offset 0
 ; NOREMAT-NEXT:    ret
 ;
 ; REMAT-LABEL: test:
 ; REMAT:       # %bb.0:
-; REMAT-NEXT:    addi sp, sp, -112
-; REMAT-NEXT:    .cfi_def_cfa_offset 112
-; REMAT-NEXT:    sd ra, 104(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s0, 96(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s1, 88(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s3, 72(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s4, 64(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s5, 56(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s6, 48(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s7, 40(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s8, 32(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s9, 24(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s10, 16(sp) # 8-byte Folded Spill
-; REMAT-NEXT:    sd s11, 8(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    addi sp, sp, -544
+; REMAT-NEXT:    .cfi_def_cfa_offset 544
+; REMAT-NEXT:    sd ra, 536(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s0, 528(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s1, 520(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s2, 512(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s3, 504(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s4, 496(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s5, 488(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s6, 480(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s7, 472(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s8, 464(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s9, 456(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s10, 448(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    sd s11, 440(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    .cfi_offset ra, -8
 ; REMAT-NEXT:    .cfi_offset s0, -16
 ; REMAT-NEXT:    .cfi_offset s1, -24
@@ -824,730 +922,980 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    .cfi_offset s9, -88
 ; REMAT-NEXT:    .cfi_offset s10, -96
 ; REMAT-NEXT:    .cfi_offset s11, -104
-; REMAT-NEXT:    li a2, 32
-; REMAT-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
-; REMAT-NEXT:    vle32.v v8, (a0)
-; REMAT-NEXT:    addi a2, a0, 512
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    addi a2, a0, 1024
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
-; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 18
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    sub sp, sp, a2
+; REMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xa0, 0x04, 0x22, 0x11, 0x12, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 544 + 18 * vlenb
+; REMAT-NEXT:    li a4, 32
+; REMAT-NEXT:    addi a5, a0, 512
+; REMAT-NEXT:    addi a3, a0, 1024
 ; REMAT-NEXT:    addi a2, a0, 1536
+; REMAT-NEXT:    li a6, 1
+; REMAT-NEXT:    slli a6, a6, 11
+; REMAT-NEXT:    li a7, 5
+; REMAT-NEXT:    slli a7, a7, 9
+; REMAT-NEXT:    li t0, 3
+; REMAT-NEXT:    slli t0, t0, 10
+; REMAT-NEXT:    li t1, 7
+; REMAT-NEXT:    slli t1, t1, 9
+; REMAT-NEXT:    lui t2, 1
+; REMAT-NEXT:    li t3, 9
+; REMAT-NEXT:    slli t3, t3, 9
+; REMAT-NEXT:    li t4, 5
+; REMAT-NEXT:    slli t4, t4, 10
+; REMAT-NEXT:    li t5, 11
+; REMAT-NEXT:    slli t5, t5, 9
+; REMAT-NEXT:    li t6, 3
+; REMAT-NEXT:    slli t6, t6, 11
+; REMAT-NEXT:    li s0, 13
+; REMAT-NEXT:    slli s0, s0, 9
+; REMAT-NEXT:    li s1, 7
+; REMAT-NEXT:    slli s1, s1, 10
+; REMAT-NEXT:    li s2, 15
+; REMAT-NEXT:    slli s2, s2, 9
+; REMAT-NEXT:    lui s3, 2
+; REMAT-NEXT:    li s4, 17
+; REMAT-NEXT:    slli s4, s4, 9
+; REMAT-NEXT:    li s5, 9
+; REMAT-NEXT:    slli s5, s5, 10
+; REMAT-NEXT:    li s6, 19
+; REMAT-NEXT:    slli s6, s6, 9
+; REMAT-NEXT:    li s7, 5
+; REMAT-NEXT:    slli s7, s7, 11
+; REMAT-NEXT:    li s8, 21
+; REMAT-NEXT:    slli s8, s8, 9
+; REMAT-NEXT:    li s9, 11
+; REMAT-NEXT:    slli s9, s9, 10
+; REMAT-NEXT:    li s10, 23
+; REMAT-NEXT:    slli s10, s10, 9
+; REMAT-NEXT:    lui s11, 3
+; REMAT-NEXT:    li ra, 25
+; REMAT-NEXT:    slli ra, ra, 9
+; REMAT-NEXT:    vsetvli zero, a4, e32, m2, ta, ma
+; REMAT-NEXT:    vle32.v v8, (a5)
+; REMAT-NEXT:    li a4, 13
+; REMAT-NEXT:    slli a4, a4, 10
+; REMAT-NEXT:    vle32.v v10, (a3)
+; REMAT-NEXT:    vle32.v v12, (a3)
+; REMAT-NEXT:    li a3, 27
+; REMAT-NEXT:    slli a3, a3, 9
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    add a2, a0, t0
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    add a2, a0, t1
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 4
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, t2
+; REMAT-NEXT:    vle32.v v4, (a0)
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a5, 14
+; REMAT-NEXT:    mul a2, a2, a5
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, t3
+; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    add a2, a0, t4
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 1
-; REMAT-NEXT:    slli a2, a2, 11
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 5
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v14
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    add a2, a0, t5
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 3
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v18
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 7
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 1
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a5, 12
+; REMAT-NEXT:    mul a2, a2, a5
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, t6
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    add a2, a0, s0
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v26
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    add a2, a0, s1
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v30
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    add a2, a0, s2
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 9
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 5
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    csrr a5, vlenb
+; REMAT-NEXT:    slli a5, a5, 4
+; REMAT-NEXT:    add a5, sp, a5
+; REMAT-NEXT:    addi a5, a5, 432
+; REMAT-NEXT:    vl2r.v v12, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    add a2, a0, s3
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 11
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; REMAT-NEXT:    csrr a5, vlenb
+; REMAT-NEXT:    li a6, 14
+; REMAT-NEXT:    mul a5, a5, a6
+; REMAT-NEXT:    add a5, sp, a5
+; REMAT-NEXT:    addi a5, a5, 432
+; REMAT-NEXT:    vl2r.v v16, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    add a2, a0, s5
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 3
-; REMAT-NEXT:    slli a2, a2, 11
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 13
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v14
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    add a2, a0, s6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 7
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    csrr a5, vlenb
+; REMAT-NEXT:    li a6, 12
+; REMAT-NEXT:    mul a5, a5, a6
+; REMAT-NEXT:    add a5, sp, a5
+; REMAT-NEXT:    addi a5, a5, 432
+; REMAT-NEXT:    vl2r.v v0, (a5) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    add a2, a0, s7
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v22
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    add a2, a0, s8
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v26
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    add a2, a0, s9
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    add a2, a0, s10
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 15
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    lui a2, 2
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v12
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 17
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 9
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v12, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, s11
 ; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 19
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v16
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    add a2, a0, ra
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v10
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 5
-; REMAT-NEXT:    slli a2, a2, 11
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    li a2, 21
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 1
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, a4
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    li a2, 11
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    li a2, 23
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v14
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    lui a2, 3
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    li a2, 25
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    li a2, 13
-; REMAT-NEXT:    slli a2, a2, 10
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    li a2, 27
-; REMAT-NEXT:    slli a2, a2, 9
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    li a2, 7
-; REMAT-NEXT:    slli a2, a2, 11
-; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v26
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 2
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v14, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    add a2, a0, a3
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v0, v18
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 4
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
+; REMAT-NEXT:    li a5, 7
+; REMAT-NEXT:    slli a5, a5, 11
+; REMAT-NEXT:    add a2, a0, a5
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    addi a3, sp, 432
+; REMAT-NEXT:    vs2r.v v18, (a3) # Unknown-size Folded Spill
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v20
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 14
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v18, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    li a2, 29
 ; REMAT-NEXT:    slli a2, a2, 9
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v28
+; REMAT-NEXT:    vle32.v v18, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v24
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 12
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v20, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    li a2, 15
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v30
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v8
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 10
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    li a2, 31
 ; REMAT-NEXT:    slli a2, a2, 9
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v6
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 3
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v12
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    slli a2, a2, 3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    lui a2, 4
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v4
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v2
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    csrr a2, vlenb
+; REMAT-NEXT:    li a3, 6
+; REMAT-NEXT:    mul a2, a2, a3
+; REMAT-NEXT:    add a2, sp, a2
+; REMAT-NEXT:    addi a2, a2, 432
+; REMAT-NEXT:    vs2r.v v8, (a2) # Unknown-size Folded Spill
 ; REMAT-NEXT:    lui a2, 4
 ; REMAT-NEXT:    addiw a2, a2, 512
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v2
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 1
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    vle32.v v20, (a2)
 ; REMAT-NEXT:    li a2, 17
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
-; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 2
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v14
+; REMAT-NEXT:    vle32.v v22, (a2)
 ; REMAT-NEXT:    lui a2, 4
 ; REMAT-NEXT:    addiw a2, a2, 1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    addi a3, sp, 432
+; REMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
+; REMAT-NEXT:    vle32.v v8, (a2)
 ; REMAT-NEXT:    li a2, 9
 ; REMAT-NEXT:    slli a2, a2, 11
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 14
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v10, (a3) # Unknown-size Folded Reload
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v18
+; REMAT-NEXT:    vle32.v v10, (a2)
 ; REMAT-NEXT:    lui a2, 5
 ; REMAT-NEXT:    addiw a2, a2, -1536
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    vle32.v v10, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 12
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v12, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    li a2, 19
 ; REMAT-NEXT:    slli a2, a2, 10
 ; REMAT-NEXT:    add a2, a0, a2
 ; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 10
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v14, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
-; REMAT-NEXT:    lui ra, 5
-; REMAT-NEXT:    addiw ra, ra, -512
-; REMAT-NEXT:    add a2, a0, ra
-; REMAT-NEXT:    vle32.v v6, (a2)
 ; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    slli a3, a3, 3
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v16, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
-; REMAT-NEXT:    lui s11, 5
-; REMAT-NEXT:    add a2, a0, s11
-; REMAT-NEXT:    vle32.v v4, (a2)
 ; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    csrr a3, vlenb
+; REMAT-NEXT:    li a4, 6
+; REMAT-NEXT:    mul a3, a3, a4
+; REMAT-NEXT:    add a3, sp, a3
+; REMAT-NEXT:    addi a3, a3, 432
+; REMAT-NEXT:    vl2r.v v18, (a3) # Unknown-size Folded Reload
 ; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
-; REMAT-NEXT:    lui s10, 5
-; REMAT-NEXT:    addiw s10, s10, 512
-; REMAT-NEXT:    add a2, a0, s10
-; REMAT-NEXT:    vle32.v v2, (a2)
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v24
-; REMAT-NEXT:    li s9, 21
-; REMAT-NEXT:    slli s9, s9, 10
-; REMAT-NEXT:    add a2, a0, s9
-; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    lui a2, 5
+; REMAT-NEXT:    addiw a2, a2, 512
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v26
-; REMAT-NEXT:    lui s8, 5
-; REMAT-NEXT:    addiw s8, s8, 1536
-; REMAT-NEXT:    add a2, a0, s8
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v28
-; REMAT-NEXT:    li s7, 11
-; REMAT-NEXT:    slli s7, s7, 11
+; REMAT-NEXT:    li s7, 21
+; REMAT-NEXT:    slli s7, s7, 10
 ; REMAT-NEXT:    add a2, a0, s7
-; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    vle32.v v22, (a2)
+; REMAT-NEXT:    lui s4, 5
+; REMAT-NEXT:    addiw s4, s4, 1536
+; REMAT-NEXT:    add a2, a0, s4
+; REMAT-NEXT:    vle32.v v24, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
 ; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v30
-; REMAT-NEXT:    lui s6, 6
-; REMAT-NEXT:    addiw s6, s6, -1536
-; REMAT-NEXT:    add a2, a0, s6
-; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    li a2, 11
+; REMAT-NEXT:    slli a2, a2, 11
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v6
-; REMAT-NEXT:    li s5, 23
-; REMAT-NEXT:    slli s5, s5, 10
-; REMAT-NEXT:    add a2, a0, s5
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v4
-; REMAT-NEXT:    lui s4, 6
-; REMAT-NEXT:    addiw s4, s4, -512
-; REMAT-NEXT:    add a2, a0, s4
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v2
 ; REMAT-NEXT:    lui s3, 6
+; REMAT-NEXT:    addiw s3, s3, -1536
 ; REMAT-NEXT:    add a2, a0, s3
-; REMAT-NEXT:    vle32.v v2, (a2)
-; REMAT-NEXT:    vle32.v v16, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v24
-; REMAT-NEXT:    lui s2, 6
-; REMAT-NEXT:    addiw s2, s2, 512
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    vle32.v v12, (a2)
+; REMAT-NEXT:    li s2, 23
+; REMAT-NEXT:    slli s2, s2, 10
 ; REMAT-NEXT:    add a2, a0, s2
-; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
+; REMAT-NEXT:    vle32.v v14, (a2)
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
+; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    lui a2, 6
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui s1, 6
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v26
-; REMAT-NEXT:    li s1, 25
-; REMAT-NEXT:    slli s1, s1, 10
-; REMAT-NEXT:    add a2, a0, s1
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v28
 ; REMAT-NEXT:    lui s0, 6
-; REMAT-NEXT:    addiw s0, s0, 1536
+; REMAT-NEXT:    addiw s0, s0, 512
 ; REMAT-NEXT:    add a2, a0, s0
-; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT:    vle32.v v20, (a2)
+; REMAT-NEXT:    li a2, 25
+; REMAT-NEXT:    slli a2, a2, 10
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v30
-; REMAT-NEXT:    li t6, 13
-; REMAT-NEXT:    slli t6, t6, 11
+; REMAT-NEXT:    lui t6, 6
+; REMAT-NEXT:    addiw t6, t6, 1536
 ; REMAT-NEXT:    add a2, a0, t6
-; REMAT-NEXT:    vle32.v v30, (a2)
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v6
-; REMAT-NEXT:    lui t5, 7
-; REMAT-NEXT:    addiw t5, t5, -1536
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    li t5, 13
+; REMAT-NEXT:    slli t5, t5, 11
 ; REMAT-NEXT:    add a2, a0, t5
-; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -1536
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    li t4, 27
 ; REMAT-NEXT:    slli t4, t4, 10
 ; REMAT-NEXT:    add a2, a0, t4
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v2
-; REMAT-NEXT:    lui t3, 7
-; REMAT-NEXT:    addiw t3, t3, -512
-; REMAT-NEXT:    add a2, a0, t3
-; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
 ; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v0
-; REMAT-NEXT:    lui t2, 7
-; REMAT-NEXT:    add a2, a0, t2
-; REMAT-NEXT:    vle32.v v0, (a2)
-; REMAT-NEXT:    vle32.v v8, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v26
-; REMAT-NEXT:    lui t1, 7
-; REMAT-NEXT:    addiw t1, t1, 512
-; REMAT-NEXT:    add a2, a0, t1
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    addiw a2, a2, -512
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    vle32.v v16, (a2)
+; REMAT-NEXT:    lui a2, 7
+; REMAT-NEXT:    add a2, a0, a2
+; REMAT-NEXT:    lui t3, 7
+; REMAT-NEXT:    vle32.v v4, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
 ; REMAT-NEXT:    vle32.v v18, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v28
-; REMAT-NEXT:    li t0, 29
-; REMAT-NEXT:    slli t0, t0, 10
-; REMAT-NEXT:    add a2, a0, t0
+; REMAT-NEXT:    lui t2, 7
+; REMAT-NEXT:    addiw t2, t2, 512
+; REMAT-NEXT:    add a2, a0, t2
+; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
 ; REMAT-NEXT:    vle32.v v20, (a2)
-; REMAT-NEXT:    vle32.v v26, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v30
-; REMAT-NEXT:    lui a7, 7
-; REMAT-NEXT:    addiw a7, a7, 1536
-; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    li t1, 29
+; REMAT-NEXT:    slli t1, t1, 10
+; REMAT-NEXT:    add a2, a0, t1
+; REMAT-NEXT:    vle32.v v0, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
 ; REMAT-NEXT:    vle32.v v22, (a2)
-; REMAT-NEXT:    vle32.v v28, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v24, v6
-; REMAT-NEXT:    li a6, 15
-; REMAT-NEXT:    slli a6, a6, 11
-; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    lui t0, 7
+; REMAT-NEXT:    addiw t0, t0, 1536
+; REMAT-NEXT:    add a2, a0, t0
 ; REMAT-NEXT:    vle32.v v24, (a2)
-; REMAT-NEXT:    vle32.v v30, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v4
-; REMAT-NEXT:    lui a5, 8
-; REMAT-NEXT:    addiw a5, a5, -1536
-; REMAT-NEXT:    add a2, a0, a5
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    vle32.v v8, (a2)
+; REMAT-NEXT:    li a7, 15
+; REMAT-NEXT:    slli a7, a7, 11
+; REMAT-NEXT:    add a2, a0, a7
+; REMAT-NEXT:    vle32.v v26, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
 ; REMAT-NEXT:    vle32.v v10, (a2)
-; REMAT-NEXT:    vle32.v v6, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v2
+; REMAT-NEXT:    lui a6, 8
+; REMAT-NEXT:    addiw a6, a6, -1536
+; REMAT-NEXT:    add a2, a0, a6
+; REMAT-NEXT:    vle32.v v28, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    vle32.v v12, (a2)
 ; REMAT-NEXT:    li a4, 31
 ; REMAT-NEXT:    slli a4, a4, 10
 ; REMAT-NEXT:    add a2, a0, a4
-; REMAT-NEXT:    vle32.v v12, (a2)
-; REMAT-NEXT:    vle32.v v4, (a2)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v0
+; REMAT-NEXT:    vle32.v v30, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
+; REMAT-NEXT:    vle32.v v14, (a2)
 ; REMAT-NEXT:    lui a3, 8
 ; REMAT-NEXT:    addiw a3, a3, -512
 ; REMAT-NEXT:    add a2, a0, a3
-; REMAT-NEXT:    vle32.v v14, (a2)
-; REMAT-NEXT:    vle32.v v2, (a2)
+; REMAT-NEXT:    vle32.v v6, (a2)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
+; REMAT-NEXT:    vle32.v v16, (a2)
 ; REMAT-NEXT:    lui a2, 8
 ; REMAT-NEXT:    add a0, a0, a2
-; REMAT-NEXT:    vle32.v v0, (a0)
-; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v16
-; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v20
-; REMAT-NEXT:    sf.vc.vv 3, 0, v26, v22
-; REMAT-NEXT:    sf.vc.vv 3, 0, v28, v24
-; REMAT-NEXT:    sf.vc.vv 3, 0, v30, v10
-; REMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
-; REMAT-NEXT:    sf.vc.vv 3, 0, v4, v14
-; REMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
+; REMAT-NEXT:    vle32.v v4, (a0)
+; REMAT-NEXT:    sf.vc.vv 3, 0, v18, v2
+; REMAT-NEXT:    sf.vc.vv 3, 0, v20, v0
+; REMAT-NEXT:    sf.vc.vv 3, 0, v22, v24
+; REMAT-NEXT:    sf.vc.vv 3, 0, v8, v26
+; REMAT-NEXT:    sf.vc.vv 3, 0, v10, v28
+; REMAT-NEXT:    sf.vc.vv 3, 0, v12, v30
+; REMAT-NEXT:    sf.vc.vv 3, 0, v14, v6
+; REMAT-NEXT:    sf.vc.vv 3, 0, v16, v4
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    addi a0, a1, 1024
 ; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    addi a0, a1, 1536
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    li a0, 1
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 416(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 5
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 408(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 3
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 400(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 7
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 392(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 1
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 384(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 9
 ; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 5
-; REMAT-NEXT:    slli a0, a0, 10
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 11
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 3
-; REMAT-NEXT:    slli a0, a0, 11
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 13
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 7
-; REMAT-NEXT:    slli a0, a0, 10
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 15
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    lui a0, 2
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 17
-; REMAT-NEXT:    slli a0, a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 9
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 376(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 5
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 19
+; REMAT-NEXT:    sd a0, 368(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 11
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 5
+; REMAT-NEXT:    sd a0, 360(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 3
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 21
+; REMAT-NEXT:    sd a0, 352(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 13
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 11
+; REMAT-NEXT:    sd a0, 344(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 7
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 23
+; REMAT-NEXT:    sd a0, 336(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 15
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    lui a0, 3
+; REMAT-NEXT:    sd a0, 328(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 2
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 25
+; REMAT-NEXT:    sd a0, 320(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 17
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 312(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s5, a1, s5
+; REMAT-NEXT:    sd s5, 304(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s6, a1, s6
+; REMAT-NEXT:    sd s6, 296(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 5
+; REMAT-NEXT:    slli a0, a0, 11
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 288(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s8, a1, s8
+; REMAT-NEXT:    sd s8, 280(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s9, a1, s9
+; REMAT-NEXT:    sd s9, 272(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s10, a1, s10
+; REMAT-NEXT:    sd s10, 264(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s11, a1, s11
+; REMAT-NEXT:    sd s11, 256(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add ra, a1, ra
+; REMAT-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 13
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 240(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 27
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    li a0, 7
-; REMAT-NEXT:    slli a0, a0, 11
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 232(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add a5, a1, a5
+; REMAT-NEXT:    sd a5, 224(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 29
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 216(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 15
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 208(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 31
 ; REMAT-NEXT:    slli a0, a0, 9
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 200(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 4
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 192(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 4
 ; REMAT-NEXT:    addiw a0, a0, 512
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 184(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 17
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 176(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 4
 ; REMAT-NEXT:    addiw a0, a0, 1536
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 168(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 9
 ; REMAT-NEXT:    slli a0, a0, 11
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 160(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    lui a0, 5
 ; REMAT-NEXT:    addiw a0, a0, -1536
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 152(sp) # 8-byte Folded Spill
 ; REMAT-NEXT:    li a0, 19
 ; REMAT-NEXT:    slli a0, a0, 10
 ; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    sd a0, 144(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 5
+; REMAT-NEXT:    addiw a0, a0, -512
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 136(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 5
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 128(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 5
+; REMAT-NEXT:    addiw a0, a0, 512
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s7, a1, s7
+; REMAT-NEXT:    sd s7, 112(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s4, a1, s4
+; REMAT-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 11
+; REMAT-NEXT:    slli a0, a0, 11
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s3, a1, s3
+; REMAT-NEXT:    sd s3, 88(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s2, a1, s2
+; REMAT-NEXT:    sd s2, 80(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 6
+; REMAT-NEXT:    addiw a0, a0, -512
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 72(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s1, a1, s1
+; REMAT-NEXT:    sd s1, 64(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add s0, a1, s0
+; REMAT-NEXT:    sd s0, 56(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    li a0, 25
+; REMAT-NEXT:    slli a0, a0, 10
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add t6, a1, t6
+; REMAT-NEXT:    sd t6, 40(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add t5, a1, t5
+; REMAT-NEXT:    sd t5, 32(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui a0, 7
+; REMAT-NEXT:    addiw a0, a0, -1536
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    add t4, a1, t4
+; REMAT-NEXT:    sd t4, 16(sp) # 8-byte Folded Spill
+; REMAT-NEXT:    lui ra, 7
+; REMAT-NEXT:    addiw ra, ra, -512
 ; REMAT-NEXT:    add ra, a1, ra
+; REMAT-NEXT:    add s11, a1, t3
+; REMAT-NEXT:    add s10, a1, t2
+; REMAT-NEXT:    add s9, a1, t1
+; REMAT-NEXT:    add s8, a1, t0
+; REMAT-NEXT:    add s7, a1, a7
+; REMAT-NEXT:    add s6, a1, a6
+; REMAT-NEXT:    add s5, a1, a4
+; REMAT-NEXT:    add s4, a1, a3
+; REMAT-NEXT:    add s3, a1, a2
+; REMAT-NEXT:    lui s2, 8
+; REMAT-NEXT:    addiw s2, s2, 512
+; REMAT-NEXT:    add s2, a1, s2
+; REMAT-NEXT:    lui s1, 8
+; REMAT-NEXT:    addiw s1, s1, 1024
+; REMAT-NEXT:    add s1, a1, s1
+; REMAT-NEXT:    lui s0, 8
+; REMAT-NEXT:    addiw s0, s0, 1536
+; REMAT-NEXT:    add s0, a1, s0
+; REMAT-NEXT:    li t6, 17
+; REMAT-NEXT:    slli t6, t6, 11
+; REMAT-NEXT:    add t6, a1, t6
+; REMAT-NEXT:    lui t5, 9
+; REMAT-NEXT:    addiw t5, t5, -1536
+; REMAT-NEXT:    add t5, a1, t5
+; REMAT-NEXT:    lui t4, 9
+; REMAT-NEXT:    addiw t4, t4, -1024
+; REMAT-NEXT:    add t4, a1, t4
+; REMAT-NEXT:    lui t3, 9
+; REMAT-NEXT:    addiw t3, t3, -512
+; REMAT-NEXT:    add t3, a1, t3
+; REMAT-NEXT:    lui t2, 9
+; REMAT-NEXT:    add t2, a1, t2
+; REMAT-NEXT:    lui t1, 9
+; REMAT-NEXT:    addiw t1, t1, 512
+; REMAT-NEXT:    add t1, a1, t1
+; REMAT-NEXT:    lui t0, 9
+; REMAT-NEXT:    addiw t0, t0, 1024
+; REMAT-NEXT:    add t0, a1, t0
+; REMAT-NEXT:    lui a7, 9
+; REMAT-NEXT:    addiw a7, a7, 1536
+; REMAT-NEXT:    add a7, a1, a7
+; REMAT-NEXT:    li a6, 19
+; REMAT-NEXT:    slli a6, a6, 11
+; REMAT-NEXT:    add a6, a1, a6
+; REMAT-NEXT:    lui a5, 10
+; REMAT-NEXT:    addiw a5, a5, -1536
+; REMAT-NEXT:    add a5, a1, a5
+; REMAT-NEXT:    lui a4, 10
+; REMAT-NEXT:    addiw a4, a4, -1024
+; REMAT-NEXT:    add a4, a1, a4
+; REMAT-NEXT:    lui a3, 10
+; REMAT-NEXT:    addiw a3, a3, -512
+; REMAT-NEXT:    add a3, a1, a3
+; REMAT-NEXT:    lui a2, 10
+; REMAT-NEXT:    add a2, a1, a2
+; REMAT-NEXT:    lui a0, 10
+; REMAT-NEXT:    addiw a0, a0, 512
+; REMAT-NEXT:    add a0, a1, a0
+; REMAT-NEXT:    addi a1, a1, 1536
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 416(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 408(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 400(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 392(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 384(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 376(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 368(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 360(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 352(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 344(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 336(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 328(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 320(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 312(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 304(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 296(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 288(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 280(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 272(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 264(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 256(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 248(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 240(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 232(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 224(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 216(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 208(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 200(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 192(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 184(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 176(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 168(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 160(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 152(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 144(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 136(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 128(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 120(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 112(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 96(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 88(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 80(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 72(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 64(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 56(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 48(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 40(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 32(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 24(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
+; REMAT-NEXT:    ld a1, 16(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    vse32.v v8, (a1)
+; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    vse32.v v8, (ra)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s11, a1, s11
 ; REMAT-NEXT:    vse32.v v8, (s11)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s10, a1, s10
 ; REMAT-NEXT:    vse32.v v8, (s10)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s9, a1, s9
 ; REMAT-NEXT:    vse32.v v8, (s9)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s8, a1, s8
 ; REMAT-NEXT:    vse32.v v8, (s8)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s7, a1, s7
 ; REMAT-NEXT:    vse32.v v8, (s7)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s6, a1, s6
 ; REMAT-NEXT:    vse32.v v8, (s6)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s5, a1, s5
 ; REMAT-NEXT:    vse32.v v8, (s5)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s4, a1, s4
 ; REMAT-NEXT:    vse32.v v8, (s4)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s3, a1, s3
 ; REMAT-NEXT:    vse32.v v8, (s3)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s2, a1, s2
 ; REMAT-NEXT:    vse32.v v8, (s2)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s1, a1, s1
 ; REMAT-NEXT:    vse32.v v8, (s1)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add s0, a1, s0
 ; REMAT-NEXT:    vse32.v v8, (s0)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t6, a1, t6
 ; REMAT-NEXT:    vse32.v v8, (t6)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t5, a1, t5
 ; REMAT-NEXT:    vse32.v v8, (t5)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t4, a1, t4
 ; REMAT-NEXT:    vse32.v v8, (t4)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t3, a1, t3
 ; REMAT-NEXT:    vse32.v v8, (t3)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t2, a1, t2
 ; REMAT-NEXT:    vse32.v v8, (t2)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t1, a1, t1
 ; REMAT-NEXT:    vse32.v v8, (t1)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add t0, a1, t0
 ; REMAT-NEXT:    vse32.v v8, (t0)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a7, a1, a7
 ; REMAT-NEXT:    vse32.v v8, (a7)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a6, a1, a6
 ; REMAT-NEXT:    vse32.v v8, (a6)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a5, a1, a5
 ; REMAT-NEXT:    vse32.v v8, (a5)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a4, a1, a4
 ; REMAT-NEXT:    vse32.v v8, (a4)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a3, a1, a3
 ; REMAT-NEXT:    vse32.v v8, (a3)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    add a2, a1, a2
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
 ; REMAT-NEXT:    vse32.v v8, (a2)
-; REMAT-NEXT:    lui a0, 8
-; REMAT-NEXT:    addiw a0, a0, 512
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 8
-; REMAT-NEXT:    addiw a0, a0, 1024
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 8
-; REMAT-NEXT:    addiw a0, a0, 1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    li a0, 17
-; REMAT-NEXT:    slli a0, a0, 11
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, -1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, -1024
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, -512
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, 512
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, 1024
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 9
-; REMAT-NEXT:    addiw a0, a0, 1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    li a0, 19
-; REMAT-NEXT:    slli a0, a0, 11
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    addiw a0, a0, -1536
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    addiw a0, a0, -1024
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v10, 0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    addiw a0, a0, -512
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    vse32.v v10, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    add a0, a1, a0
-; REMAT-NEXT:    vse32.v v8, (a0)
-; REMAT-NEXT:    lui a0, 10
-; REMAT-NEXT:    addiw a0, a0, 512
-; REMAT-NEXT:    add a0, a1, a0
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    vse32.v v8, (a0)
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
 ; REMAT-NEXT:    sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT:    ld ra, 104(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s0, 96(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s1, 88(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s2, 80(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s3, 72(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s4, 64(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s5, 56(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s6, 48(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s7, 40(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s8, 32(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s9, 24(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s10, 16(sp) # 8-byte Folded Reload
-; REMAT-NEXT:    ld s11, 8(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    csrr a0, vlenb
+; REMAT-NEXT:    li a1, 18
+; REMAT-NEXT:    mul a0, a0, a1
+; REMAT-NEXT:    add sp, sp, a0
+; REMAT-NEXT:    .cfi_def_cfa sp, 544
+; REMAT-NEXT:    ld ra, 536(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s0, 528(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s1, 520(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s2, 512(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s3, 504(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s4, 496(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s5, 488(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s6, 480(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s7, 472(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s8, 464(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s9, 456(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s10, 448(sp) # 8-byte Folded Reload
+; REMAT-NEXT:    ld s11, 440(sp) # 8-byte Folded Reload
 ; REMAT-NEXT:    .cfi_restore ra
 ; REMAT-NEXT:    .cfi_restore s0
 ; REMAT-NEXT:    .cfi_restore s1
@@ -1561,7 +1909,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; REMAT-NEXT:    .cfi_restore s9
 ; REMAT-NEXT:    .cfi_restore s10
 ; REMAT-NEXT:    .cfi_restore s11
-; REMAT-NEXT:    addi sp, sp, 112
+; REMAT-NEXT:    addi sp, sp, 544
 ; REMAT-NEXT:    .cfi_def_cfa_offset 0
 ; REMAT-NEXT:    ret
   %4 = tail call i64 @llvm.riscv.vsetvli.i64(i64 32, i64 2, i64 1)

diff  --git a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
index f9b9c8a69d431c..b1bba5fdc92116 100644
--- a/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
+++ b/llvm/test/CodeGen/RISCV/pr84653_pr85190.ll
@@ -8,13 +8,13 @@ define i1 @pr84653(i32 %x) {
 ; CHECK-NOZBB-LABEL: pr84653:
 ; CHECK-NOZBB:       # %bb.0:
 ; CHECK-NOZBB-NEXT:    sext.w a1, a0
-; CHECK-NOZBB-NEXT:    sgtz a2, a1
-; CHECK-NOZBB-NEXT:    lui a3, 524288
-; CHECK-NOZBB-NEXT:    addi a3, a3, -1
-; CHECK-NOZBB-NEXT:    xor a0, a0, a3
+; CHECK-NOZBB-NEXT:    lui a2, 524288
+; CHECK-NOZBB-NEXT:    sgtz a3, a1
+; CHECK-NOZBB-NEXT:    addi a2, a2, -1
+; CHECK-NOZBB-NEXT:    xor a0, a0, a2
 ; CHECK-NOZBB-NEXT:    sext.w a0, a0
 ; CHECK-NOZBB-NEXT:    slt a0, a0, a1
-; CHECK-NOZBB-NEXT:    and a0, a2, a0
+; CHECK-NOZBB-NEXT:    and a0, a3, a0
 ; CHECK-NOZBB-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: pr84653:
@@ -69,13 +69,13 @@ define i1 @select_to_or(i32 %x) {
 ; CHECK-NOZBB-LABEL: select_to_or:
 ; CHECK-NOZBB:       # %bb.0:
 ; CHECK-NOZBB-NEXT:    sext.w a1, a0
-; CHECK-NOZBB-NEXT:    sgtz a2, a1
-; CHECK-NOZBB-NEXT:    lui a3, 524288
-; CHECK-NOZBB-NEXT:    addi a3, a3, -1
-; CHECK-NOZBB-NEXT:    xor a0, a0, a3
+; CHECK-NOZBB-NEXT:    lui a2, 524288
+; CHECK-NOZBB-NEXT:    sgtz a3, a1
+; CHECK-NOZBB-NEXT:    addi a2, a2, -1
+; CHECK-NOZBB-NEXT:    xor a0, a0, a2
 ; CHECK-NOZBB-NEXT:    sext.w a0, a0
 ; CHECK-NOZBB-NEXT:    slt a0, a0, a1
-; CHECK-NOZBB-NEXT:    or a0, a2, a0
+; CHECK-NOZBB-NEXT:    or a0, a3, a0
 ; CHECK-NOZBB-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: select_to_or:

diff  --git a/llvm/test/CodeGen/RISCV/pr95271.ll b/llvm/test/CodeGen/RISCV/pr95271.ll
index 950e6fb5f37ec2..aa941cb8036276 100644
--- a/llvm/test/CodeGen/RISCV/pr95271.ll
+++ b/llvm/test/CodeGen/RISCV/pr95271.ll
@@ -6,22 +6,22 @@ define i32 @PR95271(ptr %p) {
 ; RV32I-LABEL: PR95271:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lui a1, 349525
+; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    addi a0, a0, 1
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -33,23 +33,23 @@ define i32 @PR95271(ptr %p) {
 ; RV64I-LABEL: PR95271:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lw a0, 0(a0)
-; RV64I-NEXT:    addiw a1, a0, 1
-; RV64I-NEXT:    addi a0, a0, 1
-; RV64I-NEXT:    srli a0, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addi a2, a0, 1
+; RV64I-NEXT:    srli a2, a2, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a0, a0, 1
+; RV64I-NEXT:    sub a0, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
+; RV64I-NEXT:    srli a0, a0, 2
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    sub a1, a1, a0
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    and a2, a1, a0
-; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1

diff  --git a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
index dd270fa12183ed..fb0c11e930b3b6 100644
--- a/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
+++ b/llvm/test/CodeGen/RISCV/regalloc-last-chance-recoloring-failure.ll
@@ -24,6 +24,8 @@ define void @last_chance_recoloring_failure() {
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 16 * vlenb
 ; CHECK-NEXT:    li a0, 55
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmclr.m v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vloxseg2ei32.v v16, (a1), v8
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -35,8 +37,6 @@ define void @last_chance_recoloring_failure() {
 ; CHECK-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vs4r.v v20, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmclr.m v0
 ; CHECK-NEXT:    li s0, 36
 ; CHECK-NEXT:    vsetvli zero, s0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwadd.vv v16, v8, v12, v0.t
@@ -84,6 +84,8 @@ define void @last_chance_recoloring_failure() {
 ; SUBREGLIVENESS-NEXT:    sub sp, sp, a0
 ; SUBREGLIVENESS-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 16 * vlenb
 ; SUBREGLIVENESS-NEXT:    li a0, 55
+; SUBREGLIVENESS-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; SUBREGLIVENESS-NEXT:    vmclr.m v0
 ; SUBREGLIVENESS-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; SUBREGLIVENESS-NEXT:    vloxseg2ei32.v v16, (a1), v8
 ; SUBREGLIVENESS-NEXT:    csrr a0, vlenb
@@ -95,8 +97,6 @@ define void @last_chance_recoloring_failure() {
 ; SUBREGLIVENESS-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
 ; SUBREGLIVENESS-NEXT:    add a0, a0, a1
 ; SUBREGLIVENESS-NEXT:    vs4r.v v20, (a0) # Unknown-size Folded Spill
-; SUBREGLIVENESS-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; SUBREGLIVENESS-NEXT:    vmclr.m v0
 ; SUBREGLIVENESS-NEXT:    li s0, 36
 ; SUBREGLIVENESS-NEXT:    vsetvli zero, s0, e16, m4, ta, ma
 ; SUBREGLIVENESS-NEXT:    vfwadd.vv v16, v8, v12, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rem.ll b/llvm/test/CodeGen/RISCV/rem.ll
index 5b27c4129df6ad..2001262008237f 100644
--- a/llvm/test/CodeGen/RISCV/rem.ll
+++ b/llvm/test/CodeGen/RISCV/rem.ll
@@ -23,8 +23,8 @@ define i32 @urem(i32 %a, i32 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    srli a0, a0, 32
 ; RV64I-NEXT:    srli a1, a1, 32
 ; RV64I-NEXT:    call __umoddi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -452,8 +452,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -463,8 +463,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind {
 ; RV32IM-LABEL: srem8:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a1, 24
-; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    slli a0, a0, 24
+; RV32IM-NEXT:    srai a1, a1, 24
 ; RV32IM-NEXT:    srai a0, a0, 24
 ; RV32IM-NEXT:    rem a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -474,8 +474,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -485,8 +485,8 @@ define i8 @srem8(i8 %a, i8 %b) nounwind {
 ; RV64IM-LABEL: srem8:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a1, 56
-; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    slli a0, a0, 56
+; RV64IM-NEXT:    srai a1, a1, 56
 ; RV64IM-NEXT:    srai a0, a0, 56
 ; RV64IM-NEXT:    remw a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -637,8 +637,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind {
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    call __modsi3
 ; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -648,8 +648,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind {
 ; RV32IM-LABEL: srem16:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slli a1, a1, 16
-; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    slli a0, a0, 16
+; RV32IM-NEXT:    srai a1, a1, 16
 ; RV32IM-NEXT:    srai a0, a0, 16
 ; RV32IM-NEXT:    rem a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -659,8 +659,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind {
 ; RV64I-NEXT:    addi sp, sp, -16
 ; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    call __moddi3
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -670,8 +670,8 @@ define i16 @srem16(i16 %a, i16 %b) nounwind {
 ; RV64IM-LABEL: srem16:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slli a1, a1, 48
-; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    slli a0, a0, 48
+; RV64IM-NEXT:    srai a1, a1, 48
 ; RV64IM-NEXT:    srai a0, a0, 48
 ; RV64IM-NEXT:    remw a0, a0, a1
 ; RV64IM-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
index 352184c2d85ada..32261ee47164e5 100644
--- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll
@@ -64,11 +64,11 @@ define void @test2(ptr nocapture noundef %a, i32 noundef signext %n) {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    lw a5, -4(a4)
 ; CHECK-NEXT:    lw a6, 0(a4)
+; CHECK-NEXT:    addi a3, a3, 2
 ; CHECK-NEXT:    addi a5, a5, 4
 ; CHECK-NEXT:    addi a6, a6, 4
 ; CHECK-NEXT:    sw a5, -4(a4)
 ; CHECK-NEXT:    sw a6, 0(a4)
-; CHECK-NEXT:    addi a3, a3, 2
 ; CHECK-NEXT:    addi a4, a4, 8
 ; CHECK-NEXT:    bne a1, a3, .LBB1_4
 ; CHECK-NEXT:  .LBB1_5: # %for.cond.cleanup.loopexit.unr-lcssa

diff  --git a/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll b/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
index 4901e268ec11a0..c1e7b682200eb1 100644
--- a/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
+++ b/llvm/test/CodeGen/RISCV/riscv-shifted-extend.ll
@@ -6,13 +6,13 @@ define void @test(ptr nocapture noundef writeonly %array1, i32 noundef signext %
 ; RV64-LABEL: test:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    addiw a3, a1, 5
-; RV64-NEXT:    slli a4, a3, 2
-; RV64-NEXT:    add a4, a0, a4
 ; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sw a2, 0(a4)
-; RV64-NEXT:    sw a2, 24(a0)
-; RV64-NEXT:    sw a3, 140(a0)
+; RV64-NEXT:    slli a4, a3, 2
+; RV64-NEXT:    add a1, a1, a0
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    sw a2, 0(a0)
+; RV64-NEXT:    sw a2, 24(a1)
+; RV64-NEXT:    sw a3, 140(a1)
 ; RV64-NEXT:    ret
 entry:
   %add = add nsw i32 %a, 5
@@ -70,13 +70,13 @@ define void @test2(ptr nocapture noundef writeonly %array1, i64 noundef %a, i64
 ; RV64-LABEL: test2:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    addi a3, a1, 5
-; RV64-NEXT:    slli a4, a3, 3
-; RV64-NEXT:    add a4, a0, a4
 ; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sd a2, 0(a4)
-; RV64-NEXT:    sd a2, 48(a0)
-; RV64-NEXT:    sd a3, 280(a0)
+; RV64-NEXT:    slli a4, a3, 3
+; RV64-NEXT:    add a1, a1, a0
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    sd a2, 0(a0)
+; RV64-NEXT:    sd a2, 48(a1)
+; RV64-NEXT:    sd a3, 280(a1)
 ; RV64-NEXT:    ret
 entry:
   %add = add nsw i64 %a, 5
@@ -101,8 +101,8 @@ define void @test3(ptr nocapture noundef %array1, i64 noundef %a, i64 noundef %b
 ; RV64-NEXT:    mv a5, a2
 ; RV64-NEXT:  .LBB3_2: # %entry
 ; RV64-NEXT:    slli a2, a4, 3
-; RV64-NEXT:    add a2, a0, a2
 ; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a2, a0, a2
 ; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    sd a5, 0(a2)
 ; RV64-NEXT:    sd a5, 48(a0)

diff  --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
index d907a37c2b3d17..634cca5dcdb71b 100644
--- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll
+++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll
@@ -133,10 +133,10 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    or a3, a3, a6
 ; RV32I-NEXT:  .LBB2_3:
 ; RV32I-NEXT:    srai a5, a5, 31
-; RV32I-NEXT:    and a4, a5, a4
 ; RV32I-NEXT:    neg a7, a2
-; RV32I-NEXT:    li a5, 32
-; RV32I-NEXT:    sub a5, a5, a2
+; RV32I-NEXT:    li a6, 32
+; RV32I-NEXT:    and a4, a5, a4
+; RV32I-NEXT:    sub a5, a6, a2
 ; RV32I-NEXT:    srl a6, a1, a7
 ; RV32I-NEXT:    bltz a5, .LBB2_5
 ; RV32I-NEXT:  # %bb.4:
@@ -181,10 +181,10 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ; RV32ZBB-NEXT:    or a3, a3, a6
 ; RV32ZBB-NEXT:  .LBB2_3:
 ; RV32ZBB-NEXT:    srai a5, a5, 31
-; RV32ZBB-NEXT:    and a4, a5, a4
 ; RV32ZBB-NEXT:    neg a7, a2
-; RV32ZBB-NEXT:    li a5, 32
-; RV32ZBB-NEXT:    sub a5, a5, a2
+; RV32ZBB-NEXT:    li a6, 32
+; RV32ZBB-NEXT:    and a4, a5, a4
+; RV32ZBB-NEXT:    sub a5, a6, a2
 ; RV32ZBB-NEXT:    srl a6, a1, a7
 ; RV32ZBB-NEXT:    bltz a5, .LBB2_5
 ; RV32ZBB-NEXT:  # %bb.4:
@@ -226,10 +226,10 @@ define i64 @rotl_64(i64 %x, i64 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    or a3, a3, a6
 ; RV32XTHEADBB-NEXT:  .LBB2_3:
 ; RV32XTHEADBB-NEXT:    srai a5, a5, 31
-; RV32XTHEADBB-NEXT:    and a4, a5, a4
 ; RV32XTHEADBB-NEXT:    neg a7, a2
-; RV32XTHEADBB-NEXT:    li a5, 32
-; RV32XTHEADBB-NEXT:    sub a5, a5, a2
+; RV32XTHEADBB-NEXT:    li a6, 32
+; RV32XTHEADBB-NEXT:    and a4, a5, a4
+; RV32XTHEADBB-NEXT:    sub a5, a6, a2
 ; RV32XTHEADBB-NEXT:    srl a6, a1, a7
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB2_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
@@ -281,10 +281,10 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    or a3, a3, a6
 ; RV32I-NEXT:  .LBB3_3:
 ; RV32I-NEXT:    srai a5, a5, 31
-; RV32I-NEXT:    and a4, a5, a4
 ; RV32I-NEXT:    neg a7, a2
-; RV32I-NEXT:    li a5, 32
-; RV32I-NEXT:    sub a5, a5, a2
+; RV32I-NEXT:    li a6, 32
+; RV32I-NEXT:    and a4, a5, a4
+; RV32I-NEXT:    sub a5, a6, a2
 ; RV32I-NEXT:    sll a6, a0, a7
 ; RV32I-NEXT:    bltz a5, .LBB3_5
 ; RV32I-NEXT:  # %bb.4:
@@ -329,10 +329,10 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ; RV32ZBB-NEXT:    or a3, a3, a6
 ; RV32ZBB-NEXT:  .LBB3_3:
 ; RV32ZBB-NEXT:    srai a5, a5, 31
-; RV32ZBB-NEXT:    and a4, a5, a4
 ; RV32ZBB-NEXT:    neg a7, a2
-; RV32ZBB-NEXT:    li a5, 32
-; RV32ZBB-NEXT:    sub a5, a5, a2
+; RV32ZBB-NEXT:    li a6, 32
+; RV32ZBB-NEXT:    and a4, a5, a4
+; RV32ZBB-NEXT:    sub a5, a6, a2
 ; RV32ZBB-NEXT:    sll a6, a0, a7
 ; RV32ZBB-NEXT:    bltz a5, .LBB3_5
 ; RV32ZBB-NEXT:  # %bb.4:
@@ -374,10 +374,10 @@ define i64 @rotr_64(i64 %x, i64 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    or a3, a3, a6
 ; RV32XTHEADBB-NEXT:  .LBB3_3:
 ; RV32XTHEADBB-NEXT:    srai a5, a5, 31
-; RV32XTHEADBB-NEXT:    and a4, a5, a4
 ; RV32XTHEADBB-NEXT:    neg a7, a2
-; RV32XTHEADBB-NEXT:    li a5, 32
-; RV32XTHEADBB-NEXT:    sub a5, a5, a2
+; RV32XTHEADBB-NEXT:    li a6, 32
+; RV32XTHEADBB-NEXT:    and a4, a5, a4
+; RV32XTHEADBB-NEXT:    sub a5, a6, a2
 ; RV32XTHEADBB-NEXT:    sll a6, a0, a7
 ; RV32XTHEADBB-NEXT:    bltz a5, .LBB3_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
@@ -1442,45 +1442,45 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32I-LABEL: rotl_64_mask_shared:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a5, a4, 26
-; RV32I-NEXT:    srli a5, a5, 31
-; RV32I-NEXT:    mv a7, a0
-; RV32I-NEXT:    bnez a5, .LBB17_2
+; RV32I-NEXT:    srli t0, a5, 31
+; RV32I-NEXT:    mv a6, a0
+; RV32I-NEXT:    bnez t0, .LBB17_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a7, a1
+; RV32I-NEXT:    mv a6, a1
 ; RV32I-NEXT:  .LBB17_2:
-; RV32I-NEXT:    andi a6, a4, 63
-; RV32I-NEXT:    sll t0, a7, a4
-; RV32I-NEXT:    bnez a5, .LBB17_4
+; RV32I-NEXT:    andi a5, a4, 63
+; RV32I-NEXT:    sll a7, a6, a4
+; RV32I-NEXT:    bnez t0, .LBB17_4
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:  .LBB17_4:
 ; RV32I-NEXT:    srli a0, a1, 1
-; RV32I-NEXT:    not t1, a4
-; RV32I-NEXT:    srl a0, a0, t1
-; RV32I-NEXT:    or a5, t0, a0
-; RV32I-NEXT:    sll a1, a1, a4
-; RV32I-NEXT:    srli a0, a7, 1
-; RV32I-NEXT:    srl a7, a0, t1
-; RV32I-NEXT:    addi a0, a6, -32
-; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    not t0, a4
+; RV32I-NEXT:    sll t1, a1, a4
+; RV32I-NEXT:    srli a1, a6, 1
+; RV32I-NEXT:    srl a6, a0, t0
+; RV32I-NEXT:    srl t0, a1, t0
+; RV32I-NEXT:    addi a0, a5, -32
+; RV32I-NEXT:    or a1, a7, a6
+; RV32I-NEXT:    or a6, t1, t0
 ; RV32I-NEXT:    bltz a0, .LBB17_6
 ; RV32I-NEXT:  # %bb.5:
-; RV32I-NEXT:    sll a3, a2, a6
+; RV32I-NEXT:    sll a3, a2, a5
 ; RV32I-NEXT:    j .LBB17_7
 ; RV32I-NEXT:  .LBB17_6:
 ; RV32I-NEXT:    sll a3, a3, a4
 ; RV32I-NEXT:    srli a7, a2, 1
-; RV32I-NEXT:    not a6, a6
-; RV32I-NEXT:    srl a6, a7, a6
-; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    not a5, a5
+; RV32I-NEXT:    srl a5, a7, a5
+; RV32I-NEXT:    or a3, a3, a5
 ; RV32I-NEXT:  .LBB17_7:
 ; RV32I-NEXT:    sll a2, a2, a4
 ; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    add a0, a1, a0
-; RV32I-NEXT:    sltu a1, a0, a1
-; RV32I-NEXT:    add a3, a5, a3
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    add a0, a6, a0
+; RV32I-NEXT:    sltu a2, a0, a6
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: rotl_64_mask_shared:
@@ -1496,45 +1496,45 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ; RV32ZBB-LABEL: rotl_64_mask_shared:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    slli a5, a4, 26
-; RV32ZBB-NEXT:    srli a5, a5, 31
-; RV32ZBB-NEXT:    mv a7, a0
-; RV32ZBB-NEXT:    bnez a5, .LBB17_2
+; RV32ZBB-NEXT:    srli t0, a5, 31
+; RV32ZBB-NEXT:    mv a6, a0
+; RV32ZBB-NEXT:    bnez t0, .LBB17_2
 ; RV32ZBB-NEXT:  # %bb.1:
-; RV32ZBB-NEXT:    mv a7, a1
+; RV32ZBB-NEXT:    mv a6, a1
 ; RV32ZBB-NEXT:  .LBB17_2:
-; RV32ZBB-NEXT:    andi a6, a4, 63
-; RV32ZBB-NEXT:    sll t0, a7, a4
-; RV32ZBB-NEXT:    bnez a5, .LBB17_4
+; RV32ZBB-NEXT:    andi a5, a4, 63
+; RV32ZBB-NEXT:    sll a7, a6, a4
+; RV32ZBB-NEXT:    bnez t0, .LBB17_4
 ; RV32ZBB-NEXT:  # %bb.3:
 ; RV32ZBB-NEXT:    mv a1, a0
 ; RV32ZBB-NEXT:  .LBB17_4:
 ; RV32ZBB-NEXT:    srli a0, a1, 1
-; RV32ZBB-NEXT:    not t1, a4
-; RV32ZBB-NEXT:    srl a0, a0, t1
-; RV32ZBB-NEXT:    or a5, t0, a0
-; RV32ZBB-NEXT:    sll a1, a1, a4
-; RV32ZBB-NEXT:    srli a0, a7, 1
-; RV32ZBB-NEXT:    srl a7, a0, t1
-; RV32ZBB-NEXT:    addi a0, a6, -32
-; RV32ZBB-NEXT:    or a1, a1, a7
+; RV32ZBB-NEXT:    not t0, a4
+; RV32ZBB-NEXT:    sll t1, a1, a4
+; RV32ZBB-NEXT:    srli a1, a6, 1
+; RV32ZBB-NEXT:    srl a6, a0, t0
+; RV32ZBB-NEXT:    srl t0, a1, t0
+; RV32ZBB-NEXT:    addi a0, a5, -32
+; RV32ZBB-NEXT:    or a1, a7, a6
+; RV32ZBB-NEXT:    or a6, t1, t0
 ; RV32ZBB-NEXT:    bltz a0, .LBB17_6
 ; RV32ZBB-NEXT:  # %bb.5:
-; RV32ZBB-NEXT:    sll a3, a2, a6
+; RV32ZBB-NEXT:    sll a3, a2, a5
 ; RV32ZBB-NEXT:    j .LBB17_7
 ; RV32ZBB-NEXT:  .LBB17_6:
 ; RV32ZBB-NEXT:    sll a3, a3, a4
 ; RV32ZBB-NEXT:    srli a7, a2, 1
-; RV32ZBB-NEXT:    not a6, a6
-; RV32ZBB-NEXT:    srl a6, a7, a6
-; RV32ZBB-NEXT:    or a3, a3, a6
+; RV32ZBB-NEXT:    not a5, a5
+; RV32ZBB-NEXT:    srl a5, a7, a5
+; RV32ZBB-NEXT:    or a3, a3, a5
 ; RV32ZBB-NEXT:  .LBB17_7:
 ; RV32ZBB-NEXT:    sll a2, a2, a4
 ; RV32ZBB-NEXT:    srai a0, a0, 31
 ; RV32ZBB-NEXT:    and a0, a0, a2
-; RV32ZBB-NEXT:    add a0, a1, a0
-; RV32ZBB-NEXT:    sltu a1, a0, a1
-; RV32ZBB-NEXT:    add a3, a5, a3
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add a0, a6, a0
+; RV32ZBB-NEXT:    sltu a2, a0, a6
+; RV32ZBB-NEXT:    add a1, a1, a3
+; RV32ZBB-NEXT:    add a1, a1, a2
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: rotl_64_mask_shared:
@@ -1546,45 +1546,45 @@ define signext i64 @rotl_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ;
 ; RV32XTHEADBB-LABEL: rotl_64_mask_shared:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    th.extu a5, a4, 5, 5
-; RV32XTHEADBB-NEXT:    mv a7, a0
-; RV32XTHEADBB-NEXT:    bnez a5, .LBB17_2
+; RV32XTHEADBB-NEXT:    th.extu t0, a4, 5, 5
+; RV32XTHEADBB-NEXT:    mv a6, a0
+; RV32XTHEADBB-NEXT:    bnez t0, .LBB17_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
-; RV32XTHEADBB-NEXT:    mv a7, a1
+; RV32XTHEADBB-NEXT:    mv a6, a1
 ; RV32XTHEADBB-NEXT:  .LBB17_2:
-; RV32XTHEADBB-NEXT:    andi a6, a4, 63
-; RV32XTHEADBB-NEXT:    sll t0, a7, a4
-; RV32XTHEADBB-NEXT:    bnez a5, .LBB17_4
+; RV32XTHEADBB-NEXT:    andi a5, a4, 63
+; RV32XTHEADBB-NEXT:    sll a7, a6, a4
+; RV32XTHEADBB-NEXT:    bnez t0, .LBB17_4
 ; RV32XTHEADBB-NEXT:  # %bb.3:
 ; RV32XTHEADBB-NEXT:    mv a1, a0
 ; RV32XTHEADBB-NEXT:  .LBB17_4:
 ; RV32XTHEADBB-NEXT:    srli a0, a1, 1
-; RV32XTHEADBB-NEXT:    not t1, a4
-; RV32XTHEADBB-NEXT:    srl a0, a0, t1
-; RV32XTHEADBB-NEXT:    or a5, t0, a0
-; RV32XTHEADBB-NEXT:    sll a1, a1, a4
-; RV32XTHEADBB-NEXT:    srli a0, a7, 1
-; RV32XTHEADBB-NEXT:    srl a7, a0, t1
-; RV32XTHEADBB-NEXT:    addi a0, a6, -32
-; RV32XTHEADBB-NEXT:    or a1, a1, a7
+; RV32XTHEADBB-NEXT:    not t0, a4
+; RV32XTHEADBB-NEXT:    sll t1, a1, a4
+; RV32XTHEADBB-NEXT:    srli a1, a6, 1
+; RV32XTHEADBB-NEXT:    srl a6, a0, t0
+; RV32XTHEADBB-NEXT:    srl t0, a1, t0
+; RV32XTHEADBB-NEXT:    addi a0, a5, -32
+; RV32XTHEADBB-NEXT:    or a1, a7, a6
+; RV32XTHEADBB-NEXT:    or a6, t1, t0
 ; RV32XTHEADBB-NEXT:    bltz a0, .LBB17_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
-; RV32XTHEADBB-NEXT:    sll a3, a2, a6
+; RV32XTHEADBB-NEXT:    sll a3, a2, a5
 ; RV32XTHEADBB-NEXT:    j .LBB17_7
 ; RV32XTHEADBB-NEXT:  .LBB17_6:
 ; RV32XTHEADBB-NEXT:    sll a3, a3, a4
 ; RV32XTHEADBB-NEXT:    srli a7, a2, 1
-; RV32XTHEADBB-NEXT:    not a6, a6
-; RV32XTHEADBB-NEXT:    srl a6, a7, a6
-; RV32XTHEADBB-NEXT:    or a3, a3, a6
+; RV32XTHEADBB-NEXT:    not a5, a5
+; RV32XTHEADBB-NEXT:    srl a5, a7, a5
+; RV32XTHEADBB-NEXT:    or a3, a3, a5
 ; RV32XTHEADBB-NEXT:  .LBB17_7:
 ; RV32XTHEADBB-NEXT:    sll a2, a2, a4
 ; RV32XTHEADBB-NEXT:    srai a0, a0, 31
 ; RV32XTHEADBB-NEXT:    and a0, a0, a2
-; RV32XTHEADBB-NEXT:    add a0, a1, a0
-; RV32XTHEADBB-NEXT:    sltu a1, a0, a1
-; RV32XTHEADBB-NEXT:    add a3, a5, a3
-; RV32XTHEADBB-NEXT:    add a1, a3, a1
+; RV32XTHEADBB-NEXT:    add a0, a6, a0
+; RV32XTHEADBB-NEXT:    sltu a2, a0, a6
+; RV32XTHEADBB-NEXT:    add a1, a1, a3
+; RV32XTHEADBB-NEXT:    add a1, a1, a2
 ; RV32XTHEADBB-NEXT:    ret
 ;
 ; RV64XTHEADBB-LABEL: rotl_64_mask_shared:
@@ -1669,27 +1669,27 @@ declare i32 @llvm.fshr.i32(i32, i32, i32)
 define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 signext %amt) nounwind {
 ; RV32I-LABEL: rotr_64_mask_shared:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a7, a4, 32
+; RV32I-NEXT:    andi t0, a4, 32
 ; RV32I-NEXT:    mv a6, a1
-; RV32I-NEXT:    beqz a7, .LBB19_2
+; RV32I-NEXT:    beqz t0, .LBB19_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a6, a0
 ; RV32I-NEXT:  .LBB19_2:
 ; RV32I-NEXT:    andi a5, a4, 63
-; RV32I-NEXT:    srl t0, a6, a4
-; RV32I-NEXT:    beqz a7, .LBB19_4
+; RV32I-NEXT:    srl a7, a6, a4
+; RV32I-NEXT:    beqz t0, .LBB19_4
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:  .LBB19_4:
 ; RV32I-NEXT:    slli a1, a0, 1
-; RV32I-NEXT:    not a7, a4
-; RV32I-NEXT:    sll a1, a1, a7
-; RV32I-NEXT:    or a1, a1, t0
-; RV32I-NEXT:    srl t0, a0, a4
+; RV32I-NEXT:    not t0, a4
+; RV32I-NEXT:    srl t1, a0, a4
 ; RV32I-NEXT:    slli a6, a6, 1
-; RV32I-NEXT:    sll a6, a6, a7
+; RV32I-NEXT:    sll a1, a1, t0
+; RV32I-NEXT:    sll a6, a6, t0
 ; RV32I-NEXT:    addi a0, a5, -32
-; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a6, a6, t1
 ; RV32I-NEXT:    bltz a0, .LBB19_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    sll a3, a2, a5
@@ -1722,27 +1722,27 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ;
 ; RV32ZBB-LABEL: rotr_64_mask_shared:
 ; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    andi a7, a4, 32
+; RV32ZBB-NEXT:    andi t0, a4, 32
 ; RV32ZBB-NEXT:    mv a6, a1
-; RV32ZBB-NEXT:    beqz a7, .LBB19_2
+; RV32ZBB-NEXT:    beqz t0, .LBB19_2
 ; RV32ZBB-NEXT:  # %bb.1:
 ; RV32ZBB-NEXT:    mv a6, a0
 ; RV32ZBB-NEXT:  .LBB19_2:
 ; RV32ZBB-NEXT:    andi a5, a4, 63
-; RV32ZBB-NEXT:    srl t0, a6, a4
-; RV32ZBB-NEXT:    beqz a7, .LBB19_4
+; RV32ZBB-NEXT:    srl a7, a6, a4
+; RV32ZBB-NEXT:    beqz t0, .LBB19_4
 ; RV32ZBB-NEXT:  # %bb.3:
 ; RV32ZBB-NEXT:    mv a0, a1
 ; RV32ZBB-NEXT:  .LBB19_4:
 ; RV32ZBB-NEXT:    slli a1, a0, 1
-; RV32ZBB-NEXT:    not a7, a4
-; RV32ZBB-NEXT:    sll a1, a1, a7
-; RV32ZBB-NEXT:    or a1, a1, t0
-; RV32ZBB-NEXT:    srl t0, a0, a4
+; RV32ZBB-NEXT:    not t0, a4
+; RV32ZBB-NEXT:    srl t1, a0, a4
 ; RV32ZBB-NEXT:    slli a6, a6, 1
-; RV32ZBB-NEXT:    sll a6, a6, a7
+; RV32ZBB-NEXT:    sll a1, a1, t0
+; RV32ZBB-NEXT:    sll a6, a6, t0
 ; RV32ZBB-NEXT:    addi a0, a5, -32
-; RV32ZBB-NEXT:    or a6, a6, t0
+; RV32ZBB-NEXT:    or a1, a1, a7
+; RV32ZBB-NEXT:    or a6, a6, t1
 ; RV32ZBB-NEXT:    bltz a0, .LBB19_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    sll a3, a2, a5
@@ -1772,27 +1772,27 @@ define signext i64 @rotr_64_mask_shared(i64 signext %a, i64 signext %b, i64 sign
 ;
 ; RV32XTHEADBB-LABEL: rotr_64_mask_shared:
 ; RV32XTHEADBB:       # %bb.0:
-; RV32XTHEADBB-NEXT:    andi a7, a4, 32
+; RV32XTHEADBB-NEXT:    andi t0, a4, 32
 ; RV32XTHEADBB-NEXT:    mv a6, a1
-; RV32XTHEADBB-NEXT:    beqz a7, .LBB19_2
+; RV32XTHEADBB-NEXT:    beqz t0, .LBB19_2
 ; RV32XTHEADBB-NEXT:  # %bb.1:
 ; RV32XTHEADBB-NEXT:    mv a6, a0
 ; RV32XTHEADBB-NEXT:  .LBB19_2:
 ; RV32XTHEADBB-NEXT:    andi a5, a4, 63
-; RV32XTHEADBB-NEXT:    srl t0, a6, a4
-; RV32XTHEADBB-NEXT:    beqz a7, .LBB19_4
+; RV32XTHEADBB-NEXT:    srl a7, a6, a4
+; RV32XTHEADBB-NEXT:    beqz t0, .LBB19_4
 ; RV32XTHEADBB-NEXT:  # %bb.3:
 ; RV32XTHEADBB-NEXT:    mv a0, a1
 ; RV32XTHEADBB-NEXT:  .LBB19_4:
 ; RV32XTHEADBB-NEXT:    slli a1, a0, 1
-; RV32XTHEADBB-NEXT:    not a7, a4
-; RV32XTHEADBB-NEXT:    sll a1, a1, a7
-; RV32XTHEADBB-NEXT:    or a1, a1, t0
-; RV32XTHEADBB-NEXT:    srl t0, a0, a4
+; RV32XTHEADBB-NEXT:    not t0, a4
+; RV32XTHEADBB-NEXT:    srl t1, a0, a4
 ; RV32XTHEADBB-NEXT:    slli a6, a6, 1
-; RV32XTHEADBB-NEXT:    sll a6, a6, a7
+; RV32XTHEADBB-NEXT:    sll a1, a1, t0
+; RV32XTHEADBB-NEXT:    sll a6, a6, t0
 ; RV32XTHEADBB-NEXT:    addi a0, a5, -32
-; RV32XTHEADBB-NEXT:    or a6, a6, t0
+; RV32XTHEADBB-NEXT:    or a1, a1, a7
+; RV32XTHEADBB-NEXT:    or a6, a6, t1
 ; RV32XTHEADBB-NEXT:    bltz a0, .LBB19_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
 ; RV32XTHEADBB-NEXT:    sll a3, a2, a5
@@ -1835,10 +1835,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sll a3, a0, a2
 ; RV32I-NEXT:    neg a4, a2
-; RV32I-NEXT:    srl a0, a0, a4
-; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    sll a2, a1, a2
+; RV32I-NEXT:    srl a0, a0, a4
 ; RV32I-NEXT:    srl a1, a1, a4
+; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -1847,10 +1847,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sllw a3, a0, a2
 ; RV64I-NEXT:    negw a4, a2
-; RV64I-NEXT:    srlw a0, a0, a4
-; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    sllw a2, a1, a2
+; RV64I-NEXT:    srlw a0, a0, a4
 ; RV64I-NEXT:    srlw a1, a1, a4
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -1873,10 +1873,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV32XTHEADBB:       # %bb.0:
 ; RV32XTHEADBB-NEXT:    sll a3, a0, a2
 ; RV32XTHEADBB-NEXT:    neg a4, a2
-; RV32XTHEADBB-NEXT:    srl a0, a0, a4
-; RV32XTHEADBB-NEXT:    or a0, a3, a0
 ; RV32XTHEADBB-NEXT:    sll a2, a1, a2
+; RV32XTHEADBB-NEXT:    srl a0, a0, a4
 ; RV32XTHEADBB-NEXT:    srl a1, a1, a4
+; RV32XTHEADBB-NEXT:    or a0, a3, a0
 ; RV32XTHEADBB-NEXT:    or a1, a2, a1
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
 ; RV32XTHEADBB-NEXT:    ret
@@ -1885,10 +1885,10 @@ define signext i32 @rotl_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    sllw a3, a0, a2
 ; RV64XTHEADBB-NEXT:    negw a4, a2
-; RV64XTHEADBB-NEXT:    srlw a0, a0, a4
-; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    sllw a2, a1, a2
+; RV64XTHEADBB-NEXT:    srlw a0, a0, a4
 ; RV64XTHEADBB-NEXT:    srlw a1, a1, a4
+; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    or a1, a2, a1
 ; RV64XTHEADBB-NEXT:    addw a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -1914,45 +1914,45 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:  .LBB21_4:
 ; RV32I-NEXT:    sll a7, a6, a4
-; RV32I-NEXT:    srli t0, a0, 1
+; RV32I-NEXT:    srli t1, a0, 1
 ; RV32I-NEXT:    not a1, a4
-; RV32I-NEXT:    srl t0, t0, a1
-; RV32I-NEXT:    sll t1, a0, a4
+; RV32I-NEXT:    sll t0, a0, a4
 ; RV32I-NEXT:    srli a0, a6, 1
-; RV32I-NEXT:    srl t2, a0, a1
+; RV32I-NEXT:    srl a6, t1, a1
+; RV32I-NEXT:    srl t1, a0, a1
 ; RV32I-NEXT:    mv a0, a3
 ; RV32I-NEXT:    bnez a5, .LBB21_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:  .LBB21_6:
-; RV32I-NEXT:    or a6, a7, t0
-; RV32I-NEXT:    or a7, t1, t2
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a7, t0, t1
 ; RV32I-NEXT:    sll t0, a0, a4
 ; RV32I-NEXT:    bnez a5, .LBB21_8
 ; RV32I-NEXT:  # %bb.7:
 ; RV32I-NEXT:    mv a2, a3
 ; RV32I-NEXT:  .LBB21_8:
 ; RV32I-NEXT:    srli a3, a2, 1
-; RV32I-NEXT:    srl a3, a3, a1
-; RV32I-NEXT:    or a3, t0, a3
 ; RV32I-NEXT:    sll a2, a2, a4
 ; RV32I-NEXT:    srli a0, a0, 1
+; RV32I-NEXT:    srl a3, a3, a1
 ; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    or a1, t0, a3
 ; RV32I-NEXT:    or a0, a2, a0
-; RV32I-NEXT:    add a1, a7, a0
-; RV32I-NEXT:    add a0, a6, a3
-; RV32I-NEXT:    sltu a2, a0, a6
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a7, a7, a0
+; RV32I-NEXT:    add a0, a6, a1
+; RV32I-NEXT:    sltu a1, a0, a6
+; RV32I-NEXT:    add a1, a7, a1
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: rotl_64_mask_multiple:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sll a3, a0, a2
 ; RV64I-NEXT:    negw a4, a2
-; RV64I-NEXT:    srl a0, a0, a4
-; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    sll a2, a1, a2
+; RV64I-NEXT:    srl a0, a0, a4
 ; RV64I-NEXT:    srl a1, a1, a4
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -1971,35 +1971,35 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32ZBB-NEXT:    mv a0, a1
 ; RV32ZBB-NEXT:  .LBB21_4:
 ; RV32ZBB-NEXT:    sll a7, a6, a4
-; RV32ZBB-NEXT:    srli t0, a0, 1
+; RV32ZBB-NEXT:    srli t1, a0, 1
 ; RV32ZBB-NEXT:    not a1, a4
-; RV32ZBB-NEXT:    srl t0, t0, a1
-; RV32ZBB-NEXT:    sll t1, a0, a4
+; RV32ZBB-NEXT:    sll t0, a0, a4
 ; RV32ZBB-NEXT:    srli a0, a6, 1
-; RV32ZBB-NEXT:    srl t2, a0, a1
+; RV32ZBB-NEXT:    srl a6, t1, a1
+; RV32ZBB-NEXT:    srl t1, a0, a1
 ; RV32ZBB-NEXT:    mv a0, a3
 ; RV32ZBB-NEXT:    bnez a5, .LBB21_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    mv a0, a2
 ; RV32ZBB-NEXT:  .LBB21_6:
-; RV32ZBB-NEXT:    or a6, a7, t0
-; RV32ZBB-NEXT:    or a7, t1, t2
+; RV32ZBB-NEXT:    or a6, a7, a6
+; RV32ZBB-NEXT:    or a7, t0, t1
 ; RV32ZBB-NEXT:    sll t0, a0, a4
 ; RV32ZBB-NEXT:    bnez a5, .LBB21_8
 ; RV32ZBB-NEXT:  # %bb.7:
 ; RV32ZBB-NEXT:    mv a2, a3
 ; RV32ZBB-NEXT:  .LBB21_8:
 ; RV32ZBB-NEXT:    srli a3, a2, 1
-; RV32ZBB-NEXT:    srl a3, a3, a1
-; RV32ZBB-NEXT:    or a3, t0, a3
 ; RV32ZBB-NEXT:    sll a2, a2, a4
 ; RV32ZBB-NEXT:    srli a0, a0, 1
+; RV32ZBB-NEXT:    srl a3, a3, a1
 ; RV32ZBB-NEXT:    srl a0, a0, a1
+; RV32ZBB-NEXT:    or a1, t0, a3
 ; RV32ZBB-NEXT:    or a0, a2, a0
-; RV32ZBB-NEXT:    add a1, a7, a0
-; RV32ZBB-NEXT:    add a0, a6, a3
-; RV32ZBB-NEXT:    sltu a2, a0, a6
-; RV32ZBB-NEXT:    add a1, a1, a2
+; RV32ZBB-NEXT:    add a7, a7, a0
+; RV32ZBB-NEXT:    add a0, a6, a1
+; RV32ZBB-NEXT:    sltu a1, a0, a6
+; RV32ZBB-NEXT:    add a1, a7, a1
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: rotl_64_mask_multiple:
@@ -2022,45 +2022,45 @@ define i64 @rotl_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32XTHEADBB-NEXT:    mv a0, a1
 ; RV32XTHEADBB-NEXT:  .LBB21_4:
 ; RV32XTHEADBB-NEXT:    sll a7, a6, a4
-; RV32XTHEADBB-NEXT:    srli t0, a0, 1
+; RV32XTHEADBB-NEXT:    srli t1, a0, 1
 ; RV32XTHEADBB-NEXT:    not a1, a4
-; RV32XTHEADBB-NEXT:    srl t0, t0, a1
-; RV32XTHEADBB-NEXT:    sll t1, a0, a4
+; RV32XTHEADBB-NEXT:    sll t0, a0, a4
 ; RV32XTHEADBB-NEXT:    srli a0, a6, 1
-; RV32XTHEADBB-NEXT:    srl t2, a0, a1
+; RV32XTHEADBB-NEXT:    srl a6, t1, a1
+; RV32XTHEADBB-NEXT:    srl t1, a0, a1
 ; RV32XTHEADBB-NEXT:    mv a0, a3
 ; RV32XTHEADBB-NEXT:    bnez a5, .LBB21_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
 ; RV32XTHEADBB-NEXT:    mv a0, a2
 ; RV32XTHEADBB-NEXT:  .LBB21_6:
-; RV32XTHEADBB-NEXT:    or a6, a7, t0
-; RV32XTHEADBB-NEXT:    or a7, t1, t2
+; RV32XTHEADBB-NEXT:    or a6, a7, a6
+; RV32XTHEADBB-NEXT:    or a7, t0, t1
 ; RV32XTHEADBB-NEXT:    sll t0, a0, a4
 ; RV32XTHEADBB-NEXT:    bnez a5, .LBB21_8
 ; RV32XTHEADBB-NEXT:  # %bb.7:
 ; RV32XTHEADBB-NEXT:    mv a2, a3
 ; RV32XTHEADBB-NEXT:  .LBB21_8:
 ; RV32XTHEADBB-NEXT:    srli a3, a2, 1
-; RV32XTHEADBB-NEXT:    srl a3, a3, a1
-; RV32XTHEADBB-NEXT:    or a3, t0, a3
 ; RV32XTHEADBB-NEXT:    sll a2, a2, a4
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 1
+; RV32XTHEADBB-NEXT:    srl a3, a3, a1
 ; RV32XTHEADBB-NEXT:    srl a0, a0, a1
+; RV32XTHEADBB-NEXT:    or a1, t0, a3
 ; RV32XTHEADBB-NEXT:    or a0, a2, a0
-; RV32XTHEADBB-NEXT:    add a1, a7, a0
-; RV32XTHEADBB-NEXT:    add a0, a6, a3
-; RV32XTHEADBB-NEXT:    sltu a2, a0, a6
-; RV32XTHEADBB-NEXT:    add a1, a1, a2
+; RV32XTHEADBB-NEXT:    add a7, a7, a0
+; RV32XTHEADBB-NEXT:    add a0, a6, a1
+; RV32XTHEADBB-NEXT:    sltu a1, a0, a6
+; RV32XTHEADBB-NEXT:    add a1, a7, a1
 ; RV32XTHEADBB-NEXT:    ret
 ;
 ; RV64XTHEADBB-LABEL: rotl_64_mask_multiple:
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    sll a3, a0, a2
 ; RV64XTHEADBB-NEXT:    negw a4, a2
-; RV64XTHEADBB-NEXT:    srl a0, a0, a4
-; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    sll a2, a1, a2
+; RV64XTHEADBB-NEXT:    srl a0, a0, a4
 ; RV64XTHEADBB-NEXT:    srl a1, a1, a4
+; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    or a1, a2, a1
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -2076,10 +2076,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srl a3, a0, a2
 ; RV32I-NEXT:    neg a4, a2
-; RV32I-NEXT:    sll a0, a0, a4
-; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    srl a2, a1, a2
+; RV32I-NEXT:    sll a0, a0, a4
 ; RV32I-NEXT:    sll a1, a1, a4
+; RV32I-NEXT:    or a0, a3, a0
 ; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -2088,10 +2088,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srlw a3, a0, a2
 ; RV64I-NEXT:    negw a4, a2
-; RV64I-NEXT:    sllw a0, a0, a4
-; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    srlw a2, a1, a2
+; RV64I-NEXT:    sllw a0, a0, a4
 ; RV64I-NEXT:    sllw a1, a1, a4
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    addw a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -2114,10 +2114,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV32XTHEADBB:       # %bb.0:
 ; RV32XTHEADBB-NEXT:    srl a3, a0, a2
 ; RV32XTHEADBB-NEXT:    neg a4, a2
-; RV32XTHEADBB-NEXT:    sll a0, a0, a4
-; RV32XTHEADBB-NEXT:    or a0, a3, a0
 ; RV32XTHEADBB-NEXT:    srl a2, a1, a2
+; RV32XTHEADBB-NEXT:    sll a0, a0, a4
 ; RV32XTHEADBB-NEXT:    sll a1, a1, a4
+; RV32XTHEADBB-NEXT:    or a0, a3, a0
 ; RV32XTHEADBB-NEXT:    or a1, a2, a1
 ; RV32XTHEADBB-NEXT:    add a0, a0, a1
 ; RV32XTHEADBB-NEXT:    ret
@@ -2126,10 +2126,10 @@ define signext i32 @rotr_32_mask_multiple(i32 signext %a, i32 signext %b, i32 si
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    srlw a3, a0, a2
 ; RV64XTHEADBB-NEXT:    negw a4, a2
-; RV64XTHEADBB-NEXT:    sllw a0, a0, a4
-; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    srlw a2, a1, a2
+; RV64XTHEADBB-NEXT:    sllw a0, a0, a4
 ; RV64XTHEADBB-NEXT:    sllw a1, a1, a4
+; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    or a1, a2, a1
 ; RV64XTHEADBB-NEXT:    addw a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -2154,30 +2154,30 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:  .LBB23_4:
 ; RV32I-NEXT:    srl a7, a6, a4
-; RV32I-NEXT:    slli t0, a1, 1
+; RV32I-NEXT:    slli t1, a1, 1
 ; RV32I-NEXT:    not a0, a4
-; RV32I-NEXT:    sll t0, t0, a0
-; RV32I-NEXT:    srl t1, a1, a4
+; RV32I-NEXT:    srl t0, a1, a4
 ; RV32I-NEXT:    slli a6, a6, 1
-; RV32I-NEXT:    sll t2, a6, a0
+; RV32I-NEXT:    sll a1, t1, a0
+; RV32I-NEXT:    sll t1, a6, a0
 ; RV32I-NEXT:    mv a6, a2
 ; RV32I-NEXT:    beqz a5, .LBB23_6
 ; RV32I-NEXT:  # %bb.5:
 ; RV32I-NEXT:    mv a6, a3
 ; RV32I-NEXT:  .LBB23_6:
-; RV32I-NEXT:    or a1, t0, a7
-; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or a1, a1, a7
+; RV32I-NEXT:    or a7, t1, t0
 ; RV32I-NEXT:    srl t0, a6, a4
 ; RV32I-NEXT:    beqz a5, .LBB23_8
 ; RV32I-NEXT:  # %bb.7:
 ; RV32I-NEXT:    mv a3, a2
 ; RV32I-NEXT:  .LBB23_8:
 ; RV32I-NEXT:    slli a2, a3, 1
-; RV32I-NEXT:    sll a2, a2, a0
-; RV32I-NEXT:    or a2, a2, t0
 ; RV32I-NEXT:    srl a3, a3, a4
 ; RV32I-NEXT:    slli a6, a6, 1
+; RV32I-NEXT:    sll a2, a2, a0
 ; RV32I-NEXT:    sll a0, a6, a0
+; RV32I-NEXT:    or a2, a2, t0
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:    add a7, a7, a0
 ; RV32I-NEXT:    add a0, a1, a2
@@ -2189,10 +2189,10 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srl a3, a0, a2
 ; RV64I-NEXT:    negw a4, a2
-; RV64I-NEXT:    sll a0, a0, a4
-; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    srl a2, a1, a2
+; RV64I-NEXT:    sll a0, a0, a4
 ; RV64I-NEXT:    sll a1, a1, a4
+; RV64I-NEXT:    or a0, a3, a0
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -2210,30 +2210,30 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32ZBB-NEXT:    mv a1, a0
 ; RV32ZBB-NEXT:  .LBB23_4:
 ; RV32ZBB-NEXT:    srl a7, a6, a4
-; RV32ZBB-NEXT:    slli t0, a1, 1
+; RV32ZBB-NEXT:    slli t1, a1, 1
 ; RV32ZBB-NEXT:    not a0, a4
-; RV32ZBB-NEXT:    sll t0, t0, a0
-; RV32ZBB-NEXT:    srl t1, a1, a4
+; RV32ZBB-NEXT:    srl t0, a1, a4
 ; RV32ZBB-NEXT:    slli a6, a6, 1
-; RV32ZBB-NEXT:    sll t2, a6, a0
+; RV32ZBB-NEXT:    sll a1, t1, a0
+; RV32ZBB-NEXT:    sll t1, a6, a0
 ; RV32ZBB-NEXT:    mv a6, a2
 ; RV32ZBB-NEXT:    beqz a5, .LBB23_6
 ; RV32ZBB-NEXT:  # %bb.5:
 ; RV32ZBB-NEXT:    mv a6, a3
 ; RV32ZBB-NEXT:  .LBB23_6:
-; RV32ZBB-NEXT:    or a1, t0, a7
-; RV32ZBB-NEXT:    or a7, t2, t1
+; RV32ZBB-NEXT:    or a1, a1, a7
+; RV32ZBB-NEXT:    or a7, t1, t0
 ; RV32ZBB-NEXT:    srl t0, a6, a4
 ; RV32ZBB-NEXT:    beqz a5, .LBB23_8
 ; RV32ZBB-NEXT:  # %bb.7:
 ; RV32ZBB-NEXT:    mv a3, a2
 ; RV32ZBB-NEXT:  .LBB23_8:
 ; RV32ZBB-NEXT:    slli a2, a3, 1
-; RV32ZBB-NEXT:    sll a2, a2, a0
-; RV32ZBB-NEXT:    or a2, a2, t0
 ; RV32ZBB-NEXT:    srl a3, a3, a4
 ; RV32ZBB-NEXT:    slli a6, a6, 1
+; RV32ZBB-NEXT:    sll a2, a2, a0
 ; RV32ZBB-NEXT:    sll a0, a6, a0
+; RV32ZBB-NEXT:    or a2, a2, t0
 ; RV32ZBB-NEXT:    or a0, a0, a3
 ; RV32ZBB-NEXT:    add a7, a7, a0
 ; RV32ZBB-NEXT:    add a0, a1, a2
@@ -2261,30 +2261,30 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV32XTHEADBB-NEXT:    mv a1, a0
 ; RV32XTHEADBB-NEXT:  .LBB23_4:
 ; RV32XTHEADBB-NEXT:    srl a7, a6, a4
-; RV32XTHEADBB-NEXT:    slli t0, a1, 1
+; RV32XTHEADBB-NEXT:    slli t1, a1, 1
 ; RV32XTHEADBB-NEXT:    not a0, a4
-; RV32XTHEADBB-NEXT:    sll t0, t0, a0
-; RV32XTHEADBB-NEXT:    srl t1, a1, a4
+; RV32XTHEADBB-NEXT:    srl t0, a1, a4
 ; RV32XTHEADBB-NEXT:    slli a6, a6, 1
-; RV32XTHEADBB-NEXT:    sll t2, a6, a0
+; RV32XTHEADBB-NEXT:    sll a1, t1, a0
+; RV32XTHEADBB-NEXT:    sll t1, a6, a0
 ; RV32XTHEADBB-NEXT:    mv a6, a2
 ; RV32XTHEADBB-NEXT:    beqz a5, .LBB23_6
 ; RV32XTHEADBB-NEXT:  # %bb.5:
 ; RV32XTHEADBB-NEXT:    mv a6, a3
 ; RV32XTHEADBB-NEXT:  .LBB23_6:
-; RV32XTHEADBB-NEXT:    or a1, t0, a7
-; RV32XTHEADBB-NEXT:    or a7, t2, t1
+; RV32XTHEADBB-NEXT:    or a1, a1, a7
+; RV32XTHEADBB-NEXT:    or a7, t1, t0
 ; RV32XTHEADBB-NEXT:    srl t0, a6, a4
 ; RV32XTHEADBB-NEXT:    beqz a5, .LBB23_8
 ; RV32XTHEADBB-NEXT:  # %bb.7:
 ; RV32XTHEADBB-NEXT:    mv a3, a2
 ; RV32XTHEADBB-NEXT:  .LBB23_8:
 ; RV32XTHEADBB-NEXT:    slli a2, a3, 1
-; RV32XTHEADBB-NEXT:    sll a2, a2, a0
-; RV32XTHEADBB-NEXT:    or a2, a2, t0
 ; RV32XTHEADBB-NEXT:    srl a3, a3, a4
 ; RV32XTHEADBB-NEXT:    slli a6, a6, 1
+; RV32XTHEADBB-NEXT:    sll a2, a2, a0
 ; RV32XTHEADBB-NEXT:    sll a0, a6, a0
+; RV32XTHEADBB-NEXT:    or a2, a2, t0
 ; RV32XTHEADBB-NEXT:    or a0, a0, a3
 ; RV32XTHEADBB-NEXT:    add a7, a7, a0
 ; RV32XTHEADBB-NEXT:    add a0, a1, a2
@@ -2296,10 +2296,10 @@ define i64 @rotr_64_mask_multiple(i64 %a, i64 %b, i64 %amt) nounwind {
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    srl a3, a0, a2
 ; RV64XTHEADBB-NEXT:    negw a4, a2
-; RV64XTHEADBB-NEXT:    sll a0, a0, a4
-; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    srl a2, a1, a2
+; RV64XTHEADBB-NEXT:    sll a0, a0, a4
 ; RV64XTHEADBB-NEXT:    sll a1, a1, a4
+; RV64XTHEADBB-NEXT:    or a0, a3, a0
 ; RV64XTHEADBB-NEXT:    or a1, a2, a1
 ; RV64XTHEADBB-NEXT:    add a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -2328,9 +2328,9 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-NEXT:    or a3, a3, a7
 ; RV32I-NEXT:  .LBB24_3:
 ; RV32I-NEXT:    srai a6, a6, 31
+; RV32I-NEXT:    li a7, 32
 ; RV32I-NEXT:    and a5, a6, a5
-; RV32I-NEXT:    li a6, 32
-; RV32I-NEXT:    sub a6, a6, a2
+; RV32I-NEXT:    sub a6, a7, a2
 ; RV32I-NEXT:    srl a7, a1, a4
 ; RV32I-NEXT:    bltz a6, .LBB24_5
 ; RV32I-NEXT:  # %bb.4:
@@ -2338,8 +2338,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-NEXT:    j .LBB24_6
 ; RV32I-NEXT:  .LBB24_5:
 ; RV32I-NEXT:    li t0, 64
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    srl a0, a0, a4
+; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    not a2, a2
 ; RV32I-NEXT:    slli a1, a1, 1
 ; RV32I-NEXT:    sll a1, a1, a2
@@ -2376,9 +2376,9 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-NEXT:    or a3, a3, a7
 ; RV32ZBB-NEXT:  .LBB24_3:
 ; RV32ZBB-NEXT:    srai a6, a6, 31
+; RV32ZBB-NEXT:    li a7, 32
 ; RV32ZBB-NEXT:    and a5, a6, a5
-; RV32ZBB-NEXT:    li a6, 32
-; RV32ZBB-NEXT:    sub a6, a6, a2
+; RV32ZBB-NEXT:    sub a6, a7, a2
 ; RV32ZBB-NEXT:    srl a7, a1, a4
 ; RV32ZBB-NEXT:    bltz a6, .LBB24_5
 ; RV32ZBB-NEXT:  # %bb.4:
@@ -2386,8 +2386,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-NEXT:    j .LBB24_6
 ; RV32ZBB-NEXT:  .LBB24_5:
 ; RV32ZBB-NEXT:    li t0, 64
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    srl a0, a0, a4
+; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    not a2, a2
 ; RV32ZBB-NEXT:    slli a1, a1, 1
 ; RV32ZBB-NEXT:    sll a1, a1, a2
@@ -2421,9 +2421,9 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    or a3, a3, a7
 ; RV32XTHEADBB-NEXT:  .LBB24_3:
 ; RV32XTHEADBB-NEXT:    srai a6, a6, 31
+; RV32XTHEADBB-NEXT:    li a7, 32
 ; RV32XTHEADBB-NEXT:    and a5, a6, a5
-; RV32XTHEADBB-NEXT:    li a6, 32
-; RV32XTHEADBB-NEXT:    sub a6, a6, a2
+; RV32XTHEADBB-NEXT:    sub a6, a7, a2
 ; RV32XTHEADBB-NEXT:    srl a7, a1, a4
 ; RV32XTHEADBB-NEXT:    bltz a6, .LBB24_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
@@ -2431,8 +2431,8 @@ define i64 @rotl_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    j .LBB24_6
 ; RV32XTHEADBB-NEXT:  .LBB24_5:
 ; RV32XTHEADBB-NEXT:    li t0, 64
-; RV32XTHEADBB-NEXT:    sub a2, t0, a2
 ; RV32XTHEADBB-NEXT:    srl a0, a0, a4
+; RV32XTHEADBB-NEXT:    sub a2, t0, a2
 ; RV32XTHEADBB-NEXT:    not a2, a2
 ; RV32XTHEADBB-NEXT:    slli a1, a1, 1
 ; RV32XTHEADBB-NEXT:    sll a1, a1, a2
@@ -2478,9 +2478,9 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-NEXT:    or a3, a3, a7
 ; RV32I-NEXT:  .LBB25_3:
 ; RV32I-NEXT:    srai a6, a6, 31
+; RV32I-NEXT:    li a7, 32
 ; RV32I-NEXT:    and a5, a6, a5
-; RV32I-NEXT:    li a6, 32
-; RV32I-NEXT:    sub a6, a6, a2
+; RV32I-NEXT:    sub a6, a7, a2
 ; RV32I-NEXT:    sll a7, a0, a4
 ; RV32I-NEXT:    bltz a6, .LBB25_5
 ; RV32I-NEXT:  # %bb.4:
@@ -2488,8 +2488,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32I-NEXT:    j .LBB25_6
 ; RV32I-NEXT:  .LBB25_5:
 ; RV32I-NEXT:    li t0, 64
-; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    sll a1, a1, a4
+; RV32I-NEXT:    sub a2, t0, a2
 ; RV32I-NEXT:    not a2, a2
 ; RV32I-NEXT:    srli a0, a0, 1
 ; RV32I-NEXT:    srl a0, a0, a2
@@ -2526,9 +2526,9 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-NEXT:    or a3, a3, a7
 ; RV32ZBB-NEXT:  .LBB25_3:
 ; RV32ZBB-NEXT:    srai a6, a6, 31
+; RV32ZBB-NEXT:    li a7, 32
 ; RV32ZBB-NEXT:    and a5, a6, a5
-; RV32ZBB-NEXT:    li a6, 32
-; RV32ZBB-NEXT:    sub a6, a6, a2
+; RV32ZBB-NEXT:    sub a6, a7, a2
 ; RV32ZBB-NEXT:    sll a7, a0, a4
 ; RV32ZBB-NEXT:    bltz a6, .LBB25_5
 ; RV32ZBB-NEXT:  # %bb.4:
@@ -2536,8 +2536,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32ZBB-NEXT:    j .LBB25_6
 ; RV32ZBB-NEXT:  .LBB25_5:
 ; RV32ZBB-NEXT:    li t0, 64
-; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    sll a1, a1, a4
+; RV32ZBB-NEXT:    sub a2, t0, a2
 ; RV32ZBB-NEXT:    not a2, a2
 ; RV32ZBB-NEXT:    srli a0, a0, 1
 ; RV32ZBB-NEXT:    srl a0, a0, a2
@@ -2571,9 +2571,9 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    or a3, a3, a7
 ; RV32XTHEADBB-NEXT:  .LBB25_3:
 ; RV32XTHEADBB-NEXT:    srai a6, a6, 31
+; RV32XTHEADBB-NEXT:    li a7, 32
 ; RV32XTHEADBB-NEXT:    and a5, a6, a5
-; RV32XTHEADBB-NEXT:    li a6, 32
-; RV32XTHEADBB-NEXT:    sub a6, a6, a2
+; RV32XTHEADBB-NEXT:    sub a6, a7, a2
 ; RV32XTHEADBB-NEXT:    sll a7, a0, a4
 ; RV32XTHEADBB-NEXT:    bltz a6, .LBB25_5
 ; RV32XTHEADBB-NEXT:  # %bb.4:
@@ -2581,8 +2581,8 @@ define i64 @rotr_64_zext(i64 %x, i32 %y) nounwind {
 ; RV32XTHEADBB-NEXT:    j .LBB25_6
 ; RV32XTHEADBB-NEXT:  .LBB25_5:
 ; RV32XTHEADBB-NEXT:    li t0, 64
-; RV32XTHEADBB-NEXT:    sub a2, t0, a2
 ; RV32XTHEADBB-NEXT:    sll a1, a1, a4
+; RV32XTHEADBB-NEXT:    sub a2, t0, a2
 ; RV32XTHEADBB-NEXT:    not a2, a2
 ; RV32XTHEADBB-NEXT:    srli a0, a0, 1
 ; RV32XTHEADBB-NEXT:    srl a0, a0, a2

diff  --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
index 248d6209d58239..4bb8d6c248caa2 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll
@@ -12,31 +12,31 @@ define i32 @ctlz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    beqz a0, .LBB0_2
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -62,11 +62,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: ctlz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -440,11 +440,11 @@ define i32 @bswap_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
@@ -466,25 +466,24 @@ define i64 @bswap_i64(i64 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 8
 ; RV32I-NEXT:    lui a3, 16
+; RV32I-NEXT:    srli a4, a1, 24
+; RV32I-NEXT:    srli a5, a0, 8
 ; RV32I-NEXT:    addi a3, a3, -256
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    or a2, a1, a2
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a5, a1, 24
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a0, a3
+; RV32I-NEXT:    or a0, a1, a2
+; RV32I-NEXT:    or a1, a3, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV32XTHEADBB-LABEL: bswap_i64:

diff  --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
index 4e958f5699adbf..b6344f88cddaa5 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
@@ -138,27 +138,26 @@ declare i64 @llvm.fshl.i64(i64, i64, i64)
 define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: rol_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    slli a3, a2, 26
-; CHECK-NEXT:    srli a3, a3, 31
-; CHECK-NEXT:    mv a4, a1
-; CHECK-NEXT:    bnez a3, .LBB7_2
+; CHECK-NEXT:    slli a5, a2, 26
+; CHECK-NEXT:    srli a5, a5, 31
+; CHECK-NEXT:    mv a3, a1
+; CHECK-NEXT:    bnez a5, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a4, a0
+; CHECK-NEXT:    mv a3, a0
 ; CHECK-NEXT:  .LBB7_2:
-; CHECK-NEXT:    sll a5, a4, a2
-; CHECK-NEXT:    bnez a3, .LBB7_4
+; CHECK-NEXT:    sll a4, a3, a2
+; CHECK-NEXT:    bnez a5, .LBB7_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    srli a1, a0, 1
-; CHECK-NEXT:    not a6, a2
-; CHECK-NEXT:    srl a3, a1, a6
-; CHECK-NEXT:    or a3, a5, a3
-; CHECK-NEXT:    sll a0, a0, a2
-; CHECK-NEXT:    srli a4, a4, 1
-; CHECK-NEXT:    srl a1, a4, a6
-; CHECK-NEXT:    or a1, a0, a1
-; CHECK-NEXT:    mv a0, a3
+; CHECK-NEXT:    not a5, a2
+; CHECK-NEXT:    sll a2, a0, a2
+; CHECK-NEXT:    srli a3, a3, 1
+; CHECK-NEXT:    srl a0, a1, a5
+; CHECK-NEXT:    srl a1, a3, a5
+; CHECK-NEXT:    or a0, a4, a0
+; CHECK-NEXT:    or a1, a2, a1
 ; CHECK-NEXT:    ret
   %or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b)
   ret i64 %or
@@ -191,24 +190,24 @@ declare i64 @llvm.fshr.i64(i64, i64, i64)
 define i64 @ror_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-LABEL: ror_i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andi a4, a2, 32
+; CHECK-NEXT:    andi a5, a2, 32
 ; CHECK-NEXT:    mv a3, a0
-; CHECK-NEXT:    beqz a4, .LBB9_2
+; CHECK-NEXT:    beqz a5, .LBB9_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:  .LBB9_2:
-; CHECK-NEXT:    srl a5, a3, a2
-; CHECK-NEXT:    beqz a4, .LBB9_4
+; CHECK-NEXT:    srl a4, a3, a2
+; CHECK-NEXT:    beqz a5, .LBB9_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:  .LBB9_4:
 ; CHECK-NEXT:    slli a0, a1, 1
-; CHECK-NEXT:    not a4, a2
-; CHECK-NEXT:    sll a0, a0, a4
-; CHECK-NEXT:    or a0, a0, a5
+; CHECK-NEXT:    not a5, a2
 ; CHECK-NEXT:    srl a1, a1, a2
 ; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    sll a2, a3, a4
+; CHECK-NEXT:    sll a0, a0, a5
+; CHECK-NEXT:    sll a2, a3, a5
+; CHECK-NEXT:    or a0, a0, a4
 ; CHECK-NEXT:    or a1, a2, a1
 ; CHECK-NEXT:    ret
   %or = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b)
@@ -252,11 +251,10 @@ define i64 @rori_i64(i64 %a) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    srli a2, a0, 1
 ; CHECK-NEXT:    slli a3, a1, 31
-; CHECK-NEXT:    or a2, a3, a2
 ; CHECK-NEXT:    srli a1, a1, 1
-; CHECK-NEXT:    slli a0, a0, 31
-; CHECK-NEXT:    or a1, a0, a1
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    slli a4, a0, 31
+; CHECK-NEXT:    or a0, a3, a2
+; CHECK-NEXT:    or a1, a4, a1
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63)
   ret i64 %1
@@ -267,11 +265,10 @@ define i64 @rori_i64_fshr(i64 %a) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    srli a2, a1, 31
 ; CHECK-NEXT:    slli a3, a0, 1
-; CHECK-NEXT:    or a2, a3, a2
-; CHECK-NEXT:    srli a0, a0, 31
+; CHECK-NEXT:    srli a4, a0, 31
 ; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    or a1, a1, a0
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    or a0, a3, a2
+; CHECK-NEXT:    or a1, a1, a4
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 63)
   ret i64 %1
@@ -299,12 +296,12 @@ define i64 @not_shl_one_i64(i64 %x) {
 ; CHECK-LABEL: not_shl_one_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi a1, a0, -32
+; CHECK-NEXT:    li a2, 1
 ; CHECK-NEXT:    slti a1, a1, 0
+; CHECK-NEXT:    sll a0, a2, a0
 ; CHECK-NEXT:    neg a2, a1
-; CHECK-NEXT:    li a3, 1
-; CHECK-NEXT:    sll a0, a3, a0
-; CHECK-NEXT:    and a2, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a2, a2, a0
 ; CHECK-NEXT:    and a1, a1, a0
 ; CHECK-NEXT:    not a0, a2
 ; CHECK-NEXT:    not a1, a1

diff  --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index af2ea35cf26c1b..90a8eadb3f974d 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -12,31 +12,31 @@ define i32 @ctlz_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    beqz a0, .LBB0_2
 ; RV32I-NEXT:  # %bb.1: # %cond.false
 ; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 2
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 4
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    srli a1, a0, 1
-; RV32I-NEXT:    lui a2, 349525
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -62,11 +62,11 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV32I-LABEL: ctlz_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    lui a3, 209715
+; RV32I-NEXT:    lui a5, 61681
 ; RV32I-NEXT:    addi a4, a2, 1365
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a3, a2, 819
-; RV32I-NEXT:    lui a2, 61681
-; RV32I-NEXT:    addi a2, a2, -241
+; RV32I-NEXT:    addi a3, a3, 819
+; RV32I-NEXT:    addi a2, a5, -241
 ; RV32I-NEXT:    bnez a1, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srli a1, a0, 1
@@ -257,17 +257,17 @@ define i32 @ctpop_i32(i32 %a) nounwind {
 ; RV32I-NEXT:    lui a2, 349525
 ; RV32I-NEXT:    addi a2, a2, 1365
 ; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 209715
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 61681
-; RV32I-NEXT:    addi a1, a1, -241
+; RV32I-NEXT:    addi a1, a2, -241
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    slli a1, a0, 8
 ; RV32I-NEXT:    add a0, a0, a1
@@ -367,39 +367,39 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    sub a0, a0, a2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a4, a0, a2
-; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    add a0, a4, a0
-; RV32I-NEXT:    srli a4, a0, 4
-; RV32I-NEXT:    add a0, a0, a4
-; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi a4, a4, -241
-; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    slli a5, a0, 8
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    slli a5, a0, 16
-; RV32I-NEXT:    add a0, a0, a5
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a5, a1, 1
 ; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
+; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    sub a1, a1, a3
-; RV32I-NEXT:    and a3, a1, a2
+; RV32I-NEXT:    and a2, a0, a4
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a3, a1, a4
 ; RV32I-NEXT:    srli a1, a1, 2
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    srli a2, a1, 4
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a4
 ; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a2, a1, 8
-; RV32I-NEXT:    add a1, a1, a2
-; RV32I-NEXT:    slli a2, a1, 16
-; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    srli a3, a1, 4
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    slli a2, a0, 8
+; RV32I-NEXT:    slli a3, a1, 8
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    slli a3, a1, 16
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a1, a3
+; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    ret
 ;
@@ -417,9 +417,9 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    seqz a0, a0
 ; RV32I-NEXT:    addi a2, a1, -1
 ; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    seqz a0, a0
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    ret
 ;
@@ -440,9 +440,9 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    addi a2, a1, -1
 ; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    snez a1, a1
 ; RV32I-NEXT:    ret
 ;
@@ -451,8 +451,8 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a0, a0
 ; RV32ZBB-NEXT:    sltiu a0, a0, 2
-; RV32ZBB-NEXT:    xori a0, a0, 1
 ; RV32ZBB-NEXT:    sltiu a1, a1, 2
+; RV32ZBB-NEXT:    xori a0, a0, 1
 ; RV32ZBB-NEXT:    xori a1, a1, 1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -476,8 +476,8 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a0, a0
 ; RV32ZBB-NEXT:    addi a0, a0, -1
-; RV32ZBB-NEXT:    seqz a0, a0
 ; RV32ZBB-NEXT:    addi a1, a1, -1
+; RV32ZBB-NEXT:    seqz a0, a0
 ; RV32ZBB-NEXT:    seqz a1, a1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -491,10 +491,10 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
 ; RV32I-NEXT:    addi a2, a0, -1
 ; RV32I-NEXT:    xor a0, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    addi a2, a1, -1
 ; RV32I-NEXT:    xor a1, a1, a2
 ; RV32I-NEXT:    sltu a1, a2, a1
+; RV32I-NEXT:    xori a0, a0, 1
 ; RV32I-NEXT:    xori a1, a1, 1
 ; RV32I-NEXT:    ret
 ;
@@ -503,8 +503,8 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a0, a0
 ; RV32ZBB-NEXT:    addi a0, a0, -1
-; RV32ZBB-NEXT:    snez a0, a0
 ; RV32ZBB-NEXT:    addi a1, a1, -1
+; RV32ZBB-NEXT:    snez a0, a0
 ; RV32ZBB-NEXT:    snez a1, a1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -519,39 +519,39 @@ define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 1
 ; RV32I-NEXT:    lui a3, 349525
+; RV32I-NEXT:    lui a4, 209715
+; RV32I-NEXT:    srli a5, a0, 1
 ; RV32I-NEXT:    addi a3, a3, 1365
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    lui a2, 209715
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a4, a1, a2
-; RV32I-NEXT:    srli a1, a1, 2
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    srli a4, a1, 4
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    lui a4, 61681
-; RV32I-NEXT:    addi a4, a4, -241
-; RV32I-NEXT:    and a1, a1, a4
-; RV32I-NEXT:    slli a5, a1, 8
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    slli a5, a1, 16
-; RV32I-NEXT:    add a1, a1, a5
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    srli a5, a0, 1
 ; RV32I-NEXT:    and a3, a5, a3
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a4, a4, 819
+; RV32I-NEXT:    addi a5, a5, -241
+; RV32I-NEXT:    sub a1, a1, a2
 ; RV32I-NEXT:    sub a0, a0, a3
-; RV32I-NEXT:    and a3, a0, a2
+; RV32I-NEXT:    and a2, a1, a4
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and a3, a0, a4
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a2
-; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    srli a2, a0, 4
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a4
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    slli a2, a0, 8
-; RV32I-NEXT:    add a0, a0, a2
-; RV32I-NEXT:    slli a2, a0, 16
-; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    srli a3, a0, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    and a0, a0, a5
+; RV32I-NEXT:    slli a2, a1, 8
+; RV32I-NEXT:    slli a3, a0, 8
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    slli a2, a1, 16
+; RV32I-NEXT:    slli a3, a0, 16
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    add a0, a0, a3
+; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    srli a0, a0, 24
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    li a1, 0
@@ -682,77 +682,77 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
 define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32I-LABEL: ctpop_v2i64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lw a3, 4(a1)
-; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a2, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    srli a5, a3, 1
-; RV32I-NEXT:    lui a6, 349525
-; RV32I-NEXT:    addi a6, a6, 1365
-; RV32I-NEXT:    and a5, a5, a6
-; RV32I-NEXT:    sub a3, a3, a5
-; RV32I-NEXT:    lui a5, 209715
-; RV32I-NEXT:    addi a5, a5, 819
-; RV32I-NEXT:    and a7, a3, a5
+; RV32I-NEXT:    lui a5, 349525
+; RV32I-NEXT:    addi a5, a5, 1365
+; RV32I-NEXT:    srli a6, a4, 1
+; RV32I-NEXT:    srli a7, a3, 1
+; RV32I-NEXT:    srli t0, a1, 1
+; RV32I-NEXT:    srli t1, a2, 1
+; RV32I-NEXT:    and a6, a6, a5
+; RV32I-NEXT:    and a7, a7, a5
+; RV32I-NEXT:    and t0, t0, a5
+; RV32I-NEXT:    and a5, t1, a5
+; RV32I-NEXT:    lui t1, 209715
+; RV32I-NEXT:    addi t1, t1, 819
+; RV32I-NEXT:    sub a4, a4, a6
+; RV32I-NEXT:    sub a3, a3, a7
+; RV32I-NEXT:    sub a1, a1, t0
+; RV32I-NEXT:    sub a2, a2, a5
+; RV32I-NEXT:    and a5, a4, t1
+; RV32I-NEXT:    srli a4, a4, 2
+; RV32I-NEXT:    and a6, a3, t1
 ; RV32I-NEXT:    srli a3, a3, 2
-; RV32I-NEXT:    and a3, a3, a5
-; RV32I-NEXT:    add a3, a7, a3
+; RV32I-NEXT:    and a7, a1, t1
+; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and t0, a2, t1
+; RV32I-NEXT:    srli a2, a2, 2
+; RV32I-NEXT:    and a4, a4, t1
+; RV32I-NEXT:    and a3, a3, t1
+; RV32I-NEXT:    and a1, a1, t1
+; RV32I-NEXT:    and a2, a2, t1
+; RV32I-NEXT:    add a4, a5, a4
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a5, a5, -241
+; RV32I-NEXT:    add a3, a6, a3
+; RV32I-NEXT:    add a1, a7, a1
+; RV32I-NEXT:    add a2, t0, a2
+; RV32I-NEXT:    srli a6, a4, 4
 ; RV32I-NEXT:    srli a7, a3, 4
+; RV32I-NEXT:    srli t0, a1, 4
+; RV32I-NEXT:    add a4, a4, a6
+; RV32I-NEXT:    srli a6, a2, 4
 ; RV32I-NEXT:    add a3, a3, a7
-; RV32I-NEXT:    lui a7, 61681
-; RV32I-NEXT:    addi a7, a7, -241
-; RV32I-NEXT:    and a3, a3, a7
-; RV32I-NEXT:    slli t0, a3, 8
-; RV32I-NEXT:    add a3, a3, t0
-; RV32I-NEXT:    slli t0, a3, 16
-; RV32I-NEXT:    add a3, a3, t0
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    srli t0, a4, 1
-; RV32I-NEXT:    and t0, t0, a6
-; RV32I-NEXT:    sub a4, a4, t0
-; RV32I-NEXT:    and t0, a4, a5
-; RV32I-NEXT:    srli a4, a4, 2
+; RV32I-NEXT:    add a1, a1, t0
+; RV32I-NEXT:    add a2, a2, a6
 ; RV32I-NEXT:    and a4, a4, a5
-; RV32I-NEXT:    add a4, t0, a4
-; RV32I-NEXT:    srli t0, a4, 4
-; RV32I-NEXT:    add a4, a4, t0
-; RV32I-NEXT:    and a4, a4, a7
-; RV32I-NEXT:    slli t0, a4, 8
-; RV32I-NEXT:    add a4, a4, t0
-; RV32I-NEXT:    slli t0, a4, 16
-; RV32I-NEXT:    add a4, a4, t0
-; RV32I-NEXT:    srli a4, a4, 24
-; RV32I-NEXT:    add a3, a4, a3
-; RV32I-NEXT:    srli a4, a1, 1
-; RV32I-NEXT:    and a4, a4, a6
-; RV32I-NEXT:    sub a1, a1, a4
-; RV32I-NEXT:    and a4, a1, a5
-; RV32I-NEXT:    srli a1, a1, 2
+; RV32I-NEXT:    and a3, a3, a5
 ; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    add a1, a4, a1
-; RV32I-NEXT:    srli a4, a1, 4
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    and a1, a1, a7
-; RV32I-NEXT:    slli a4, a1, 8
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    slli a4, a1, 16
-; RV32I-NEXT:    add a1, a1, a4
-; RV32I-NEXT:    srli a1, a1, 24
-; RV32I-NEXT:    srli a4, a2, 1
-; RV32I-NEXT:    and a4, a4, a6
-; RV32I-NEXT:    sub a2, a2, a4
-; RV32I-NEXT:    and a4, a2, a5
-; RV32I-NEXT:    srli a2, a2, 2
 ; RV32I-NEXT:    and a2, a2, a5
-; RV32I-NEXT:    add a2, a4, a2
-; RV32I-NEXT:    srli a4, a2, 4
-; RV32I-NEXT:    add a2, a2, a4
-; RV32I-NEXT:    and a2, a2, a7
-; RV32I-NEXT:    slli a4, a2, 8
-; RV32I-NEXT:    add a2, a2, a4
-; RV32I-NEXT:    slli a4, a2, 16
-; RV32I-NEXT:    add a2, a2, a4
+; RV32I-NEXT:    slli a5, a4, 8
+; RV32I-NEXT:    slli a6, a3, 8
+; RV32I-NEXT:    slli a7, a1, 8
+; RV32I-NEXT:    slli t0, a2, 8
+; RV32I-NEXT:    add a4, a4, a5
+; RV32I-NEXT:    add a3, a3, a6
+; RV32I-NEXT:    add a1, a1, a7
+; RV32I-NEXT:    add a2, a2, t0
+; RV32I-NEXT:    slli a5, a4, 16
+; RV32I-NEXT:    slli a6, a3, 16
+; RV32I-NEXT:    slli a7, a1, 16
+; RV32I-NEXT:    slli t0, a2, 16
+; RV32I-NEXT:    add a4, a4, a5
+; RV32I-NEXT:    add a3, a3, a6
+; RV32I-NEXT:    add a1, a1, a7
+; RV32I-NEXT:    add a2, a2, t0
+; RV32I-NEXT:    srli a4, a4, 24
+; RV32I-NEXT:    srli a3, a3, 24
+; RV32I-NEXT:    srli a1, a1, 24
 ; RV32I-NEXT:    srli a2, a2, 24
+; RV32I-NEXT:    add a3, a3, a4
 ; RV32I-NEXT:    add a1, a2, a1
 ; RV32I-NEXT:    sw a3, 0(a0)
 ; RV32I-NEXT:    sw zero, 4(a0)
@@ -764,14 +764,14 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a2, 4(a1)
 ; RV32ZBB-NEXT:    lw a3, 0(a1)
-; RV32ZBB-NEXT:    lw a4, 8(a1)
-; RV32ZBB-NEXT:    lw a1, 12(a1)
+; RV32ZBB-NEXT:    lw a4, 12(a1)
+; RV32ZBB-NEXT:    lw a1, 8(a1)
 ; RV32ZBB-NEXT:    cpop a2, a2
 ; RV32ZBB-NEXT:    cpop a3, a3
-; RV32ZBB-NEXT:    add a2, a3, a2
+; RV32ZBB-NEXT:    cpop a4, a4
 ; RV32ZBB-NEXT:    cpop a1, a1
-; RV32ZBB-NEXT:    cpop a3, a4
-; RV32ZBB-NEXT:    add a1, a3, a1
+; RV32ZBB-NEXT:    add a2, a3, a2
+; RV32ZBB-NEXT:    add a1, a1, a4
 ; RV32ZBB-NEXT:    sw a2, 0(a0)
 ; RV32ZBB-NEXT:    sw zero, 4(a0)
 ; RV32ZBB-NEXT:    sw a1, 8(a0)
@@ -787,35 +787,35 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    lw a1, 0(a0)
 ; RV32I-NEXT:    lw a2, 4(a0)
 ; RV32I-NEXT:    lw a3, 8(a0)
-; RV32I-NEXT:    lw a4, 12(a0)
-; RV32I-NEXT:    addi a0, a1, -1
-; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    addi a4, a1, -1
+; RV32I-NEXT:    and a4, a1, a4
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    and a1, a2, a1
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    seqz a0, a0
-; RV32I-NEXT:    addi a1, a3, -1
-; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    seqz a2, a3
-; RV32I-NEXT:    sub a2, a4, a2
-; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    seqz a1, a1
+; RV32I-NEXT:    addi a2, a3, -1
+; RV32I-NEXT:    and a2, a3, a2
+; RV32I-NEXT:    seqz a3, a3
+; RV32I-NEXT:    sub a3, a0, a3
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    or a2, a2, a0
+; RV32I-NEXT:    seqz a0, a1
+; RV32I-NEXT:    seqz a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64_ult_two:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 0(a0)
-; RV32ZBB-NEXT:    lw a0, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 4(a0)
+; RV32ZBB-NEXT:    lw a0, 0(a0)
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    cpop a3, a3
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    cpop a2, a3
-; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    add a0, a0, a3
 ; RV32ZBB-NEXT:    sltiu a0, a0, 2
 ; RV32ZBB-NEXT:    sltiu a1, a1, 2
 ; RV32ZBB-NEXT:    ret
@@ -830,38 +830,38 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
 ; RV32I-NEXT:    lw a1, 0(a0)
 ; RV32I-NEXT:    lw a2, 4(a0)
 ; RV32I-NEXT:    lw a3, 8(a0)
-; RV32I-NEXT:    lw a4, 12(a0)
-; RV32I-NEXT:    addi a0, a1, -1
-; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    addi a4, a1, -1
+; RV32I-NEXT:    and a4, a1, a4
 ; RV32I-NEXT:    seqz a1, a1
 ; RV32I-NEXT:    sub a1, a2, a1
 ; RV32I-NEXT:    and a1, a2, a1
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    snez a0, a0
-; RV32I-NEXT:    addi a1, a3, -1
-; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    seqz a2, a3
-; RV32I-NEXT:    sub a2, a4, a2
-; RV32I-NEXT:    and a2, a4, a2
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    snez a1, a1
+; RV32I-NEXT:    addi a2, a3, -1
+; RV32I-NEXT:    and a2, a3, a2
+; RV32I-NEXT:    seqz a3, a3
+; RV32I-NEXT:    sub a3, a0, a3
+; RV32I-NEXT:    and a0, a0, a3
+; RV32I-NEXT:    or a1, a4, a1
+; RV32I-NEXT:    or a2, a2, a0
+; RV32I-NEXT:    snez a0, a1
+; RV32I-NEXT:    snez a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: ctpop_v2i64_ugt_one:
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 0(a0)
-; RV32ZBB-NEXT:    lw a0, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 4(a0)
+; RV32ZBB-NEXT:    lw a0, 0(a0)
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    cpop a3, a3
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    cpop a2, a3
-; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    add a0, a0, a3
 ; RV32ZBB-NEXT:    sltiu a0, a0, 2
-; RV32ZBB-NEXT:    xori a0, a0, 1
 ; RV32ZBB-NEXT:    sltiu a1, a1, 2
+; RV32ZBB-NEXT:    xori a0, a0, 1
 ; RV32ZBB-NEXT:    xori a1, a1, 1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -906,17 +906,17 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 0(a0)
-; RV32ZBB-NEXT:    lw a0, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 4(a0)
+; RV32ZBB-NEXT:    lw a0, 0(a0)
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    cpop a3, a3
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    cpop a2, a3
-; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    add a0, a0, a3
 ; RV32ZBB-NEXT:    addi a0, a0, -1
-; RV32ZBB-NEXT:    seqz a0, a0
 ; RV32ZBB-NEXT:    addi a1, a1, -1
+; RV32ZBB-NEXT:    seqz a0, a0
 ; RV32ZBB-NEXT:    seqz a1, a1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -963,17 +963,17 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    lw a1, 12(a0)
 ; RV32ZBB-NEXT:    lw a2, 8(a0)
-; RV32ZBB-NEXT:    lw a3, 0(a0)
-; RV32ZBB-NEXT:    lw a0, 4(a0)
+; RV32ZBB-NEXT:    lw a3, 4(a0)
+; RV32ZBB-NEXT:    lw a0, 0(a0)
 ; RV32ZBB-NEXT:    cpop a1, a1
 ; RV32ZBB-NEXT:    cpop a2, a2
-; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    cpop a3, a3
 ; RV32ZBB-NEXT:    cpop a0, a0
-; RV32ZBB-NEXT:    cpop a2, a3
-; RV32ZBB-NEXT:    add a0, a2, a0
+; RV32ZBB-NEXT:    add a1, a2, a1
+; RV32ZBB-NEXT:    add a0, a0, a3
 ; RV32ZBB-NEXT:    addi a0, a0, -1
-; RV32ZBB-NEXT:    snez a0, a0
 ; RV32ZBB-NEXT:    addi a1, a1, -1
+; RV32ZBB-NEXT:    snez a0, a0
 ; RV32ZBB-NEXT:    snez a1, a1
 ; RV32ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -1300,11 +1300,11 @@ define i32 @bswap_i32(i32 %a) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    srli a3, a0, 24
 ; RV32I-NEXT:    addi a2, a2, -256
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    and a2, a0, a2
+; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    slli a2, a2, 8
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a2
@@ -1326,25 +1326,24 @@ define i64 @bswap_i64(i64 %a) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a1, 8
 ; RV32I-NEXT:    lui a3, 16
+; RV32I-NEXT:    srli a4, a1, 24
+; RV32I-NEXT:    srli a5, a0, 8
 ; RV32I-NEXT:    addi a3, a3, -256
 ; RV32I-NEXT:    and a2, a2, a3
-; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    or a2, a2, a4
-; RV32I-NEXT:    and a4, a1, a3
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    or a2, a1, a2
-; RV32I-NEXT:    srli a1, a0, 8
-; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    and a5, a5, a3
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a5, a1, 24
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    slli a1, a1, 8
+; RV32I-NEXT:    or a1, a5, a1
 ; RV32I-NEXT:    and a3, a0, a3
-; RV32I-NEXT:    slli a3, a3, 8
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a0, a3
+; RV32I-NEXT:    or a0, a1, a2
+; RV32I-NEXT:    or a1, a3, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: bswap_i64:
@@ -1405,12 +1404,12 @@ define i64 @orc_b_i64(i64 %a) {
 ; CHECK-NEXT:    and a1, a1, a2
 ; CHECK-NEXT:    and a0, a0, a2
 ; CHECK-NEXT:    slli a2, a0, 8
-; CHECK-NEXT:    sltu a3, a2, a0
-; CHECK-NEXT:    srli a4, a0, 24
-; CHECK-NEXT:    slli a5, a1, 8
-; CHECK-NEXT:    or a4, a5, a4
-; CHECK-NEXT:    sub a1, a4, a1
-; CHECK-NEXT:    sub a1, a1, a3
+; CHECK-NEXT:    srli a3, a0, 24
+; CHECK-NEXT:    slli a4, a1, 8
+; CHECK-NEXT:    sltu a5, a2, a0
+; CHECK-NEXT:    or a3, a4, a3
+; CHECK-NEXT:    sub a1, a3, a1
+; CHECK-NEXT:    sub a1, a1, a5
 ; CHECK-NEXT:    sub a0, a2, a0
 ; CHECK-NEXT:    ret
   %1 = and i64 %a, 72340172838076673

diff  --git a/llvm/test/CodeGen/RISCV/rv32zbs.ll b/llvm/test/CodeGen/RISCV/rv32zbs.ll
index c0b9e0b3c7748e..1a3beeb79b85bd 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbs.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbs.ll
@@ -49,14 +49,14 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: bclr_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    andi a3, a2, 63
-; RV32I-NEXT:    addi a4, a3, -32
-; RV32I-NEXT:    slti a4, a4, 0
+; RV32I-NEXT:    li a4, 1
+; RV32I-NEXT:    addi a5, a3, -32
+; RV32I-NEXT:    sll a2, a4, a2
+; RV32I-NEXT:    sll a3, a4, a3
+; RV32I-NEXT:    slti a4, a5, 0
 ; RV32I-NEXT:    neg a5, a4
-; RV32I-NEXT:    li a6, 1
-; RV32I-NEXT:    sll a2, a6, a2
-; RV32I-NEXT:    and a2, a5, a2
-; RV32I-NEXT:    sll a3, a6, a3
 ; RV32I-NEXT:    addi a4, a4, -1
+; RV32I-NEXT:    and a2, a5, a2
 ; RV32I-NEXT:    and a3, a4, a3
 ; RV32I-NEXT:    not a2, a2
 ; RV32I-NEXT:    not a3, a3
@@ -67,13 +67,13 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBSNOZBB-LABEL: bclr_i64:
 ; RV32ZBSNOZBB:       # %bb.0:
 ; RV32ZBSNOZBB-NEXT:    andi a3, a2, 63
+; RV32ZBSNOZBB-NEXT:    bset a2, zero, a2
 ; RV32ZBSNOZBB-NEXT:    addi a4, a3, -32
+; RV32ZBSNOZBB-NEXT:    bset a3, zero, a3
 ; RV32ZBSNOZBB-NEXT:    slti a4, a4, 0
 ; RV32ZBSNOZBB-NEXT:    neg a5, a4
-; RV32ZBSNOZBB-NEXT:    bset a2, zero, a2
-; RV32ZBSNOZBB-NEXT:    and a2, a5, a2
-; RV32ZBSNOZBB-NEXT:    bset a3, zero, a3
 ; RV32ZBSNOZBB-NEXT:    addi a4, a4, -1
+; RV32ZBSNOZBB-NEXT:    and a2, a5, a2
 ; RV32ZBSNOZBB-NEXT:    and a3, a4, a3
 ; RV32ZBSNOZBB-NEXT:    not a3, a3
 ; RV32ZBSNOZBB-NEXT:    not a2, a2
@@ -84,13 +84,13 @@ define i64 @bclr_i64(i64 %a, i64 %b) nounwind {
 ; RV32ZBSZBB-LABEL: bclr_i64:
 ; RV32ZBSZBB:       # %bb.0:
 ; RV32ZBSZBB-NEXT:    andi a3, a2, 63
+; RV32ZBSZBB-NEXT:    bset a2, zero, a2
 ; RV32ZBSZBB-NEXT:    bset a4, zero, a3
 ; RV32ZBSZBB-NEXT:    addi a3, a3, -32
 ; RV32ZBSZBB-NEXT:    slti a3, a3, 0
 ; RV32ZBSZBB-NEXT:    addi a5, a3, -1
-; RV32ZBSZBB-NEXT:    and a4, a5, a4
 ; RV32ZBSZBB-NEXT:    neg a3, a3
-; RV32ZBSZBB-NEXT:    bset a2, zero, a2
+; RV32ZBSZBB-NEXT:    and a4, a5, a4
 ; RV32ZBSZBB-NEXT:    and a2, a3, a2
 ; RV32ZBSZBB-NEXT:    andn a0, a0, a2
 ; RV32ZBSZBB-NEXT:    andn a1, a1, a4
@@ -187,24 +187,24 @@ define signext i64 @bset_i64_zero(i64 signext %a) nounwind {
 ; RV32I-LABEL: bset_i64_zero:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a1, a0, -32
+; RV32I-NEXT:    li a2, 1
 ; RV32I-NEXT:    slti a1, a1, 0
-; RV32I-NEXT:    neg a2, a1
-; RV32I-NEXT:    li a3, 1
-; RV32I-NEXT:    sll a3, a3, a0
-; RV32I-NEXT:    and a0, a2, a3
+; RV32I-NEXT:    sll a2, a2, a0
+; RV32I-NEXT:    neg a0, a1
 ; RV32I-NEXT:    addi a1, a1, -1
-; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
 ; RV32ZBS-LABEL: bset_i64_zero:
 ; RV32ZBS:       # %bb.0:
 ; RV32ZBS-NEXT:    addi a1, a0, -32
-; RV32ZBS-NEXT:    slti a1, a1, 0
-; RV32ZBS-NEXT:    neg a2, a1
-; RV32ZBS-NEXT:    bset a3, zero, a0
-; RV32ZBS-NEXT:    and a0, a2, a3
-; RV32ZBS-NEXT:    addi a1, a1, -1
-; RV32ZBS-NEXT:    and a1, a1, a3
+; RV32ZBS-NEXT:    bset a2, zero, a0
+; RV32ZBS-NEXT:    slti a0, a1, 0
+; RV32ZBS-NEXT:    neg a1, a0
+; RV32ZBS-NEXT:    addi a3, a0, -1
+; RV32ZBS-NEXT:    and a0, a1, a2
+; RV32ZBS-NEXT:    and a1, a3, a2
 ; RV32ZBS-NEXT:    ret
   %shl = shl i64 1, %a
   ret i64 %shl

diff  --git a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
index 315bf86046dff5..dd49d9e3e2dce1 100644
--- a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll
@@ -73,13 +73,13 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64I-NEXT:    li a1, -449
 ; RV64I-NEXT:    slli a1, a1, 53
 ; RV64I-NEXT:    call __gedf2
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __fixdfti
-; RV64I-NEXT:    mv s2, a0
+; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    mv s3, a1
 ; RV64I-NEXT:    li s5, -1
-; RV64I-NEXT:    bgez s1, .LBB4_2
+; RV64I-NEXT:    bgez s2, .LBB4_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    slli s3, s5, 63
 ; RV64I-NEXT:  .LBB4_2:
@@ -97,14 +97,14 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64I-NEXT:    mv a1, s0
 ; RV64I-NEXT:    call __unorddf2
 ; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    slti a1, s2, 0
+; RV64I-NEXT:    sgtz a2, s4
 ; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    addi a3, a1, -1
 ; RV64I-NEXT:    and a1, a0, s3
-; RV64I-NEXT:    slti a2, s1, 0
-; RV64I-NEXT:    addi a2, a2, -1
-; RV64I-NEXT:    and a2, a2, s2
-; RV64I-NEXT:    sgtz a3, s4
-; RV64I-NEXT:    neg a3, a3
-; RV64I-NEXT:    or a2, a3, a2
+; RV64I-NEXT:    and a3, a3, s1
+; RV64I-NEXT:    neg a2, a2
+; RV64I-NEXT:    or a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -140,11 +140,11 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64ID-NEXT:    srli a1, a2, 1
 ; RV64ID-NEXT:  .LBB4_4:
 ; RV64ID-NEXT:    feq.d a2, fs0, fs0
-; RV64ID-NEXT:    neg a2, a2
-; RV64ID-NEXT:    and a1, a2, a1
 ; RV64ID-NEXT:    neg a3, a3
 ; RV64ID-NEXT:    neg a4, s0
+; RV64ID-NEXT:    neg a2, a2
 ; RV64ID-NEXT:    and a0, a4, a0
+; RV64ID-NEXT:    and a1, a2, a1
 ; RV64ID-NEXT:    or a0, a3, a0
 ; RV64ID-NEXT:    and a0, a2, a0
 ; RV64ID-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -178,11 +178,11 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind {
 ; RV64IDINX-NEXT:    srli a1, a2, 1
 ; RV64IDINX-NEXT:  .LBB4_4:
 ; RV64IDINX-NEXT:    feq.d a2, s0, s0
-; RV64IDINX-NEXT:    neg a2, a2
-; RV64IDINX-NEXT:    and a1, a2, a1
 ; RV64IDINX-NEXT:    neg a3, a3
 ; RV64IDINX-NEXT:    neg a4, s1
+; RV64IDINX-NEXT:    neg a2, a2
 ; RV64IDINX-NEXT:    and a0, a4, a0
+; RV64IDINX-NEXT:    and a1, a2, a1
 ; RV64IDINX-NEXT:    or a0, a3, a0
 ; RV64IDINX-NEXT:    and a0, a2, a0
 ; RV64IDINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -219,10 +219,10 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind {
 ; RV64I-NEXT:    mv a0, s0
 ; RV64I-NEXT:    call __gtdf2
 ; RV64I-NEXT:    sgtz a0, a0
-; RV64I-NEXT:    neg a1, a0
-; RV64I-NEXT:    or a0, a1, s3
-; RV64I-NEXT:    and a2, s2, s1
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    and a1, s2, s1
+; RV64I-NEXT:    neg a2, a0
+; RV64I-NEXT:    or a0, a2, s3
+; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -245,10 +245,10 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind {
 ; RV64ID-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV64ID-NEXT:    fld fa5, %lo(.LCPI5_0)(a2)
 ; RV64ID-NEXT:    and a0, s0, a0
+; RV64ID-NEXT:    and a1, s0, a1
 ; RV64ID-NEXT:    flt.d a2, fa5, fs0
 ; RV64ID-NEXT:    neg a2, a2
 ; RV64ID-NEXT:    or a0, a2, a0
-; RV64ID-NEXT:    and a1, s0, a1
 ; RV64ID-NEXT:    or a1, a2, a1
 ; RV64ID-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64ID-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -270,10 +270,10 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind {
 ; RV64IDINX-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV64IDINX-NEXT:    ld a2, %lo(.LCPI5_0)(a2)
 ; RV64IDINX-NEXT:    and a0, s1, a0
+; RV64IDINX-NEXT:    and a1, s1, a1
 ; RV64IDINX-NEXT:    flt.d a2, a2, s0
 ; RV64IDINX-NEXT:    neg a2, a2
 ; RV64IDINX-NEXT:    or a0, a2, a0
-; RV64IDINX-NEXT:    and a1, s1, a1
 ; RV64IDINX-NEXT:    or a1, a2, a1
 ; RV64IDINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IDINX-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll
index 8ebb9433bad79a..0cdd92fbaf916b 100644
--- a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll
@@ -133,14 +133,14 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind {
 ; RV64I-NEXT:    mv a1, s0
 ; RV64I-NEXT:    call __unordsf2
 ; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    slti a1, s1, 0
+; RV64I-NEXT:    sgtz a2, s4
 ; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    addi a3, a1, -1
 ; RV64I-NEXT:    and a1, a0, s3
-; RV64I-NEXT:    slti a2, s1, 0
-; RV64I-NEXT:    addi a2, a2, -1
-; RV64I-NEXT:    and a2, a2, s2
-; RV64I-NEXT:    sgtz a3, s4
-; RV64I-NEXT:    neg a3, a3
-; RV64I-NEXT:    or a2, a3, a2
+; RV64I-NEXT:    and a3, a3, s2
+; RV64I-NEXT:    neg a2, a2
+; RV64I-NEXT:    or a2, a2, a3
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -176,11 +176,11 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind {
 ; RV64IF-NEXT:    srli a1, a3, 1
 ; RV64IF-NEXT:  .LBB4_4:
 ; RV64IF-NEXT:    feq.s a3, fs0, fs0
-; RV64IF-NEXT:    neg a3, a3
-; RV64IF-NEXT:    and a1, a3, a1
 ; RV64IF-NEXT:    neg a4, s0
-; RV64IF-NEXT:    and a0, a4, a0
 ; RV64IF-NEXT:    neg a2, a2
+; RV64IF-NEXT:    neg a3, a3
+; RV64IF-NEXT:    and a0, a4, a0
+; RV64IF-NEXT:    and a1, a3, a1
 ; RV64IF-NEXT:    or a0, a2, a0
 ; RV64IF-NEXT:    and a0, a3, a0
 ; RV64IF-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -213,11 +213,11 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind {
 ; RV64IZFINX-NEXT:    srli a1, a2, 1
 ; RV64IZFINX-NEXT:  .LBB4_4:
 ; RV64IZFINX-NEXT:    feq.s a2, s0, s0
-; RV64IZFINX-NEXT:    neg a2, a2
-; RV64IZFINX-NEXT:    and a1, a2, a1
 ; RV64IZFINX-NEXT:    neg a4, s1
-; RV64IZFINX-NEXT:    and a0, a4, a0
 ; RV64IZFINX-NEXT:    neg a3, a3
+; RV64IZFINX-NEXT:    neg a2, a2
+; RV64IZFINX-NEXT:    and a0, a4, a0
+; RV64IZFINX-NEXT:    and a1, a2, a1
 ; RV64IZFINX-NEXT:    or a0, a3, a0
 ; RV64IZFINX-NEXT:    and a0, a2, a0
 ; RV64IZFINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -252,8 +252,8 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind {
 ; RV64I-NEXT:    sext.w a0, s0
 ; RV64I-NEXT:    call __fixunssfti
 ; RV64I-NEXT:    and a0, s2, a0
-; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    and a1, s2, a1
+; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    or a1, s1, a1
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -276,10 +276,10 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind {
 ; RV64IF-NEXT:    lui a2, %hi(.LCPI5_0)
 ; RV64IF-NEXT:    flw fa5, %lo(.LCPI5_0)(a2)
 ; RV64IF-NEXT:    and a0, s0, a0
+; RV64IF-NEXT:    and a1, s0, a1
 ; RV64IF-NEXT:    flt.s a2, fa5, fs0
 ; RV64IF-NEXT:    neg a2, a2
 ; RV64IF-NEXT:    or a0, a2, a0
-; RV64IF-NEXT:    and a1, s0, a1
 ; RV64IF-NEXT:    or a1, a2, a1
 ; RV64IF-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IF-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -300,11 +300,11 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind {
 ; RV64IZFINX-NEXT:    call __fixunssfti
 ; RV64IZFINX-NEXT:    and a0, s1, a0
 ; RV64IZFINX-NEXT:    lui a2, 522240
+; RV64IZFINX-NEXT:    and a1, s1, a1
 ; RV64IZFINX-NEXT:    addiw a2, a2, -1
 ; RV64IZFINX-NEXT:    flt.s a2, a2, s0
 ; RV64IZFINX-NEXT:    neg a2, a2
 ; RV64IZFINX-NEXT:    or a0, a2, a0
-; RV64IZFINX-NEXT:    and a1, s1, a1
 ; RV64IZFINX-NEXT:    or a1, a2, a1
 ; RV64IZFINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IZFINX-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
index 2cb2ecbd57f65c..a717c6c71f2ec0 100644
--- a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll
@@ -174,14 +174,14 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind {
 ; RV64I-NEXT:    mv a1, s1
 ; RV64I-NEXT:    call __unordsf2
 ; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    sgtz a1, s4
+; RV64I-NEXT:    slti a2, s0, 0
 ; RV64I-NEXT:    addi a0, a0, -1
+; RV64I-NEXT:    neg a3, a1
+; RV64I-NEXT:    addi a2, a2, -1
 ; RV64I-NEXT:    and a1, a0, s3
-; RV64I-NEXT:    sgtz a2, s4
-; RV64I-NEXT:    neg a2, a2
-; RV64I-NEXT:    slti a3, s0, 0
-; RV64I-NEXT:    addi a3, a3, -1
-; RV64I-NEXT:    and a3, a3, s2
-; RV64I-NEXT:    or a2, a2, a3
+; RV64I-NEXT:    and a2, a2, s2
+; RV64I-NEXT:    or a2, a3, a2
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
@@ -218,11 +218,11 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind {
 ; RV64IZFH-NEXT:    srli a1, a2, 1
 ; RV64IZFH-NEXT:  .LBB4_4:
 ; RV64IZFH-NEXT:    feq.s a2, fs0, fs0
-; RV64IZFH-NEXT:    neg a2, a2
-; RV64IZFH-NEXT:    and a1, a2, a1
 ; RV64IZFH-NEXT:    neg a3, a3
 ; RV64IZFH-NEXT:    neg a4, s0
+; RV64IZFH-NEXT:    neg a2, a2
 ; RV64IZFH-NEXT:    and a0, a4, a0
+; RV64IZFH-NEXT:    and a1, a2, a1
 ; RV64IZFH-NEXT:    or a0, a3, a0
 ; RV64IZFH-NEXT:    and a0, a2, a0
 ; RV64IZFH-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -255,11 +255,11 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind {
 ; RV64IZHINX-NEXT:    srli a1, a2, 1
 ; RV64IZHINX-NEXT:  .LBB4_4:
 ; RV64IZHINX-NEXT:    feq.s a2, s0, s0
-; RV64IZHINX-NEXT:    neg a2, a2
-; RV64IZHINX-NEXT:    and a1, a2, a1
 ; RV64IZHINX-NEXT:    neg a3, a3
 ; RV64IZHINX-NEXT:    neg a4, s1
+; RV64IZHINX-NEXT:    neg a2, a2
 ; RV64IZHINX-NEXT:    and a0, a4, a0
+; RV64IZHINX-NEXT:    and a1, a2, a1
 ; RV64IZHINX-NEXT:    or a0, a3, a0
 ; RV64IZHINX-NEXT:    and a0, a2, a0
 ; RV64IZHINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -295,8 +295,8 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind {
 ; RV64I-NEXT:    sext.w a0, s0
 ; RV64I-NEXT:    call __fixunssfti
 ; RV64I-NEXT:    and a0, s2, a0
-; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    and a1, s2, a1
+; RV64I-NEXT:    or a0, s1, a0
 ; RV64I-NEXT:    or a1, s1, a1
 ; RV64I-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -314,15 +314,15 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind {
 ; RV64IZFH-NEXT:    lui a0, %hi(.LCPI5_0)
 ; RV64IZFH-NEXT:    flw fa5, %lo(.LCPI5_0)(a0)
 ; RV64IZFH-NEXT:    fcvt.s.h fa0, fa0
-; RV64IZFH-NEXT:    flt.s a0, fa5, fa0
-; RV64IZFH-NEXT:    neg s0, a0
-; RV64IZFH-NEXT:    fmv.w.x fa5, zero
-; RV64IZFH-NEXT:    fle.s a0, fa5, fa0
+; RV64IZFH-NEXT:    fmv.w.x fa4, zero
+; RV64IZFH-NEXT:    fle.s a0, fa4, fa0
+; RV64IZFH-NEXT:    flt.s a1, fa5, fa0
+; RV64IZFH-NEXT:    neg s0, a1
 ; RV64IZFH-NEXT:    neg s1, a0
 ; RV64IZFH-NEXT:    call __fixunssfti
 ; RV64IZFH-NEXT:    and a0, s1, a0
-; RV64IZFH-NEXT:    or a0, s0, a0
 ; RV64IZFH-NEXT:    and a1, s1, a1
+; RV64IZFH-NEXT:    or a0, s0, a0
 ; RV64IZFH-NEXT:    or a1, s0, a1
 ; RV64IZFH-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IZFH-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -339,14 +339,14 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind {
 ; RV64IZHINX-NEXT:    fcvt.s.h a0, a0
 ; RV64IZHINX-NEXT:    lui a1, 522240
 ; RV64IZHINX-NEXT:    addiw a1, a1, -1
+; RV64IZHINX-NEXT:    fle.s a2, zero, a0
 ; RV64IZHINX-NEXT:    flt.s a1, a1, a0
 ; RV64IZHINX-NEXT:    neg s0, a1
-; RV64IZHINX-NEXT:    fle.s a1, zero, a0
-; RV64IZHINX-NEXT:    neg s1, a1
+; RV64IZHINX-NEXT:    neg s1, a2
 ; RV64IZHINX-NEXT:    call __fixunssfti
 ; RV64IZHINX-NEXT:    and a0, s1, a0
-; RV64IZHINX-NEXT:    or a0, s0, a0
 ; RV64IZHINX-NEXT:    and a1, s1, a1
+; RV64IZHINX-NEXT:    or a0, s0, a0
 ; RV64IZHINX-NEXT:    or a1, s0, a1
 ; RV64IZHINX-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; RV64IZHINX-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
index ba184063265098..1ec4d8ddd1d84e 100644
--- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
@@ -18,15 +18,15 @@ define i64 @test0(i64 %n, ptr %p) nounwind {
 ; RV64-NEXT:    mv s0, a0
 ; RV64-NEXT:    lui a0, %hi(f)
 ; RV64-NEXT:    addi a0, a0, %lo(f)
+; RV64-NEXT:    li a2, 919
+; RV64-NEXT:    lui a3, %hi(.LCPI0_0)
 ; RV64-NEXT:    sd a0, 32(sp)
-; RV64-NEXT:    li a0, 919
-; RV64-NEXT:    lui a2, %hi(.LCPI0_0)
-; RV64-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
-; RV64-NEXT:    lui a3, 6203
-; RV64-NEXT:    addi a3, a3, 643
-; RV64-NEXT:    sw a0, 8(sp)
-; RV64-NEXT:    sw a3, 12(sp)
-; RV64-NEXT:    sd a2, 16(sp)
+; RV64-NEXT:    lui a0, 6203
+; RV64-NEXT:    ld a3, %lo(.LCPI0_0)(a3)
+; RV64-NEXT:    addi a0, a0, 643
+; RV64-NEXT:    sw a2, 8(sp)
+; RV64-NEXT:    sw a0, 12(sp)
+; RV64-NEXT:    sd a3, 16(sp)
 ; RV64-NEXT:    sd a1, 24(sp)
 ; RV64-NEXT:    addi a1, sp, 24
 ; RV64-NEXT:    addi a0, sp, 8
@@ -49,15 +49,15 @@ define i64 @test0(i64 %n, ptr %p) nounwind {
 ; RV64-LINUX-NEXT:    mv s0, a0
 ; RV64-LINUX-NEXT:    lui a0, %hi(f)
 ; RV64-LINUX-NEXT:    addi a0, a0, %lo(f)
+; RV64-LINUX-NEXT:    li a2, 919
+; RV64-LINUX-NEXT:    lui a3, %hi(.LCPI0_0)
 ; RV64-LINUX-NEXT:    sd a0, 32(sp)
-; RV64-LINUX-NEXT:    li a0, 919
-; RV64-LINUX-NEXT:    lui a2, %hi(.LCPI0_0)
-; RV64-LINUX-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
-; RV64-LINUX-NEXT:    lui a3, 6203
-; RV64-LINUX-NEXT:    addi a3, a3, 643
-; RV64-LINUX-NEXT:    sw a0, 8(sp)
-; RV64-LINUX-NEXT:    sw a3, 12(sp)
-; RV64-LINUX-NEXT:    sd a2, 16(sp)
+; RV64-LINUX-NEXT:    lui a0, 6203
+; RV64-LINUX-NEXT:    ld a3, %lo(.LCPI0_0)(a3)
+; RV64-LINUX-NEXT:    addi a0, a0, 643
+; RV64-LINUX-NEXT:    sw a2, 8(sp)
+; RV64-LINUX-NEXT:    sw a0, 12(sp)
+; RV64-LINUX-NEXT:    sd a3, 16(sp)
 ; RV64-LINUX-NEXT:    sd a1, 24(sp)
 ; RV64-LINUX-NEXT:    addi a1, sp, 24
 ; RV64-LINUX-NEXT:    addi a0, sp, 8

diff  --git a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
index 6c4466796aeedd..b3c22a5322cb4c 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-shift-sext.ll
@@ -175,10 +175,10 @@ define i8 @test13(ptr %0, i64 %1) {
 ; RV64I-LABEL: test13:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a2, 1
-; RV64I-NEXT:    subw a2, a2, a1
-; RV64I-NEXT:    add a2, a0, a2
 ; RV64I-NEXT:    li a3, 2
+; RV64I-NEXT:    subw a2, a2, a1
 ; RV64I-NEXT:    subw a3, a3, a1
+; RV64I-NEXT:    add a2, a0, a2
 ; RV64I-NEXT:    add a0, a0, a3
 ; RV64I-NEXT:    lbu a1, 0(a2)
 ; RV64I-NEXT:    lbu a0, 0(a0)
@@ -203,8 +203,8 @@ define signext i32 @test14(ptr %0, ptr %1, i64 %2) {
 ; RV64I-NEXT:    li a3, 1
 ; RV64I-NEXT:    subw a3, a3, a2
 ; RV64I-NEXT:    add a0, a0, a3
-; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    slli a3, a3, 2
+; RV64I-NEXT:    lbu a0, 0(a0)
 ; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    lw a1, 0(a1)
 ; RV64I-NEXT:    addw a0, a0, a1

diff  --git a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
index ab1691543c78af..0782018833de30 100644
--- a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll
@@ -7,11 +7,11 @@ define signext i32 @addw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
 ; CHECK-NEXT:    bge a0, a1, .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    not a2, a0
-; CHECK-NEXT:    add a2, a2, a1
 ; CHECK-NEXT:    addi a3, a0, 1
-; CHECK-NEXT:    mul a3, a2, a3
+; CHECK-NEXT:    add a2, a2, a1
 ; CHECK-NEXT:    subw a1, a1, a0
 ; CHECK-NEXT:    addi a1, a1, -2
+; CHECK-NEXT:    mul a3, a2, a3
 ; CHECK-NEXT:    slli a1, a1, 32
 ; CHECK-NEXT:    slli a2, a2, 32
 ; CHECK-NEXT:    mulhu a1, a2, a1
@@ -53,13 +53,13 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin
 ; CHECK-NEXT:    bge a0, a1, .LBB1_2
 ; CHECK-NEXT:  # %bb.1: # %for.body.preheader
 ; CHECK-NEXT:    not a2, a0
-; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    mul a2, a3, a2
-; CHECK-NEXT:    subw a1, a1, a0
-; CHECK-NEXT:    addi a1, a1, -2
-; CHECK-NEXT:    slli a1, a1, 32
+; CHECK-NEXT:    subw a3, a1, a0
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    addi a3, a3, -2
+; CHECK-NEXT:    mul a2, a1, a2
 ; CHECK-NEXT:    slli a3, a3, 32
-; CHECK-NEXT:    mulhu a1, a3, a1
+; CHECK-NEXT:    slli a1, a1, 32
+; CHECK-NEXT:    mulhu a1, a1, a3
 ; CHECK-NEXT:    srli a1, a1, 1
 ; CHECK-NEXT:    subw a0, a2, a0
 ; CHECK-NEXT:    subw a0, a0, a1

diff  --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 47c4e8beecced0..d9f7d361272934 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -12,31 +12,31 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB0_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -64,31 +64,31 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB1_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -125,31 +125,31 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a1, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a2, a1, 1
+; RV64I-NEXT:    lui a3, 349525
 ; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 16
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    addiw a2, a3, 1365
+; RV64I-NEXT:    srliw a3, a1, 2
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 4
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 8
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 16
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    srli a2, a1, 1
-; RV64I-NEXT:    lui a3, 349525
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a3, a1, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a1, a3
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    srli a2, a1, 4
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    addi a2, a3, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 8
 ; RV64I-NEXT:    add a1, a1, a2
@@ -179,39 +179,39 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: findLastSet_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    srliw a2, a1, 2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 16
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    srliw a3, a1, 2
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 4
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 8
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 16
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    srli a2, a1, 1
-; RV64I-NEXT:    lui a3, 349525
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a3, a1, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a1, a3
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    addi a3, a3, -241
+; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    srli a2, a1, 4
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a2, a1, 8
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 16
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    srliw a1, a1, 24
 ; RV64I-NEXT:    xori a1, a1, 31
-; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -219,10 +219,10 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64XTHEADBB-LABEL: findLastSet_i32:
 ; RV64XTHEADBB:       # %bb.0:
 ; RV64XTHEADBB-NEXT:    not a1, a0
+; RV64XTHEADBB-NEXT:    snez a0, a0
 ; RV64XTHEADBB-NEXT:    slli a1, a1, 32
 ; RV64XTHEADBB-NEXT:    th.ff0 a1, a1
 ; RV64XTHEADBB-NEXT:    xori a1, a1, 31
-; RV64XTHEADBB-NEXT:    snez a0, a0
 ; RV64XTHEADBB-NEXT:    addi a0, a0, -1
 ; RV64XTHEADBB-NEXT:    or a0, a0, a1
 ; RV64XTHEADBB-NEXT:    ret
@@ -240,31 +240,31 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    beqz a0, .LBB4_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -296,40 +296,40 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB5_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -456,10 +456,10 @@ define signext i32 @findFirstSet_i32(i32 signext %a) nounwind {
 ; RV64XTHEADBB-NEXT:    addi a1, a0, -1
 ; RV64XTHEADBB-NEXT:    not a2, a0
 ; RV64XTHEADBB-NEXT:    and a1, a2, a1
-; RV64XTHEADBB-NEXT:    th.ff1 a1, a1
 ; RV64XTHEADBB-NEXT:    li a2, 64
-; RV64XTHEADBB-NEXT:    sub a2, a2, a1
 ; RV64XTHEADBB-NEXT:    snez a0, a0
+; RV64XTHEADBB-NEXT:    th.ff1 a1, a1
+; RV64XTHEADBB-NEXT:    sub a2, a2, a1
 ; RV64XTHEADBB-NEXT:    addi a0, a0, -1
 ; RV64XTHEADBB-NEXT:    or a0, a0, a2
 ; RV64XTHEADBB-NEXT:    ret
@@ -486,8 +486,8 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, a1, %lo(.LCPI9_0)
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    addi a0, a0, 1
 ; RV64I-NEXT:    seqz a1, s0
+; RV64I-NEXT:    addi a0, a0, 1
 ; RV64I-NEXT:    addi a1, a1, -1
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -500,10 +500,10 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64XTHEADBB-NEXT:    addi a1, a0, -1
 ; RV64XTHEADBB-NEXT:    not a2, a0
 ; RV64XTHEADBB-NEXT:    and a1, a2, a1
-; RV64XTHEADBB-NEXT:    th.ff1 a1, a1
 ; RV64XTHEADBB-NEXT:    li a2, 65
-; RV64XTHEADBB-NEXT:    sub a2, a2, a1
 ; RV64XTHEADBB-NEXT:    seqz a0, a0
+; RV64XTHEADBB-NEXT:    th.ff1 a1, a1
+; RV64XTHEADBB-NEXT:    sub a2, a2, a1
 ; RV64XTHEADBB-NEXT:    addi a0, a0, -1
 ; RV64XTHEADBB-NEXT:    and a0, a0, a2
 ; RV64XTHEADBB-NEXT:    ret
@@ -802,11 +802,11 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srliw a3, a0, 24
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
@@ -827,11 +827,11 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 8
 ; RV64I-NEXT:    lui a3, 16
+; RV64I-NEXT:    srliw a4, a0, 24
 ; RV64I-NEXT:    addi a3, a3, -256
 ; RV64I-NEXT:    and a2, a2, a3
-; RV64I-NEXT:    srliw a4, a0, 24
-; RV64I-NEXT:    or a2, a2, a4
 ; RV64I-NEXT:    and a3, a0, a3
+; RV64I-NEXT:    or a2, a2, a4
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
@@ -856,28 +856,28 @@ define i64 @bswap_i64(i64 %a) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 40
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srli a3, a0, 56
+; RV64I-NEXT:    srli a4, a0, 24
+; RV64I-NEXT:    lui a5, 4080
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    lui a4, 4080
-; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a5, a0, 8
-; RV64I-NEXT:    srliw a5, a5, 24
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    and a4, a0, a4
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    and a4, a4, a5
+; RV64I-NEXT:    srliw a3, a3, 24
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    and a5, a0, a5
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a2, a2, 40
+; RV64I-NEXT:    or a1, a3, a1
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index 07726b643b51ad..9760821832b375 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -2656,8 +2656,8 @@ define i64 @array_index_lshr_sh3_sh3(ptr %p, i64 %idx1, i64 %idx2) {
 ; RV64I-LABEL: array_index_lshr_sh3_sh3:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a1, 58
-; RV64I-NEXT:    slli a1, a1, 6
 ; RV64I-NEXT:    slli a2, a2, 3
+; RV64I-NEXT:    slli a1, a1, 6
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    ld a0, 0(a0)
@@ -2759,8 +2759,8 @@ define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) {
 ; RV64I-LABEL: test_gep_gep_dont_crash:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a2, a2, 6
-; RV64I-NEXT:    slli a2, a2, 3
 ; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    slli a2, a2, 3
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    add a0, a0, a2
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll b/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
index a7af8ab348e99e..3f984deccfb2c7 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb-intrinsic.ll
@@ -59,12 +59,12 @@ define i64 @orcb64_knownbits(i64 %a) nounwind {
 ; RV64ZBB-LABEL: orcb64_knownbits:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    lui a1, 65535
+; RV64ZBB-NEXT:    lui a2, 256
 ; RV64ZBB-NEXT:    slli a1, a1, 12
+; RV64ZBB-NEXT:    addiw a2, a2, 8
 ; RV64ZBB-NEXT:    and a0, a0, a1
-; RV64ZBB-NEXT:    lui a1, 256
-; RV64ZBB-NEXT:    addiw a1, a1, 8
-; RV64ZBB-NEXT:    slli a2, a1, 42
-; RV64ZBB-NEXT:    add a1, a1, a2
+; RV64ZBB-NEXT:    slli a1, a2, 42
+; RV64ZBB-NEXT:    add a1, a2, a1
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    orc.b a0, a0
 ; RV64ZBB-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
index d9afb7c00ce58f..bf077364c9c7ab 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb-zbkb.ll
@@ -146,10 +146,10 @@ define signext i32 @rol_i32_neg_constant_rhs(i32 signext %a) nounwind {
 ; RV64I-LABEL: rol_i32_neg_constant_rhs:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a1, -2
-; RV64I-NEXT:    sllw a2, a1, a0
-; RV64I-NEXT:    negw a0, a0
-; RV64I-NEXT:    srlw a0, a1, a0
-; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    negw a2, a0
+; RV64I-NEXT:    sllw a0, a1, a0
+; RV64I-NEXT:    srlw a1, a1, a2
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-ZBKB-LABEL: rol_i32_neg_constant_rhs:
@@ -224,10 +224,10 @@ define signext i32 @ror_i32_neg_constant_rhs(i32 signext %a) nounwind {
 ; RV64I-LABEL: ror_i32_neg_constant_rhs:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a1, -2
-; RV64I-NEXT:    srlw a2, a1, a0
-; RV64I-NEXT:    negw a0, a0
-; RV64I-NEXT:    sllw a0, a1, a0
-; RV64I-NEXT:    or a0, a2, a0
+; RV64I-NEXT:    negw a2, a0
+; RV64I-NEXT:    srlw a0, a1, a0
+; RV64I-NEXT:    sllw a1, a1, a2
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV64ZBB-ZBKB-LABEL: ror_i32_neg_constant_rhs:

diff  --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index 1e7814d588e4c0..d67db77c04a8ea 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -12,31 +12,31 @@ define signext i32 @ctlz_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB0_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -62,31 +62,31 @@ define signext i32 @log2_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB1_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -121,31 +121,31 @@ define signext i32 @log2_ceil_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    beqz a1, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a2, a1, 1
+; RV64I-NEXT:    lui a3, 349525
 ; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 16
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    addiw a2, a3, 1365
+; RV64I-NEXT:    srliw a3, a1, 2
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 4
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 8
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 16
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    srli a2, a1, 1
-; RV64I-NEXT:    lui a3, 349525
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a3, a1, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a1, a3
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    srli a2, a1, 4
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addi a2, a2, -241
+; RV64I-NEXT:    addi a2, a3, -241
 ; RV64I-NEXT:    and a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 8
 ; RV64I-NEXT:    add a1, a1, a2
@@ -173,39 +173,39 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-LABEL: findLastSet_i32:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    srliw a2, a1, 2
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 4
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    srliw a2, a1, 16
-; RV64I-NEXT:    or a1, a1, a2
+; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    srliw a3, a1, 2
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 4
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 8
+; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    srliw a3, a1, 16
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    srli a2, a1, 1
-; RV64I-NEXT:    lui a3, 349525
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    srli a3, a1, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a1, a3
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    snez a0, a0
+; RV64I-NEXT:    addi a3, a3, -241
+; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    srli a2, a1, 4
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    lui a2, 61681
-; RV64I-NEXT:    addi a2, a2, -241
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    slli a2, a1, 8
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 16
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    srliw a1, a1, 24
 ; RV64I-NEXT:    xori a1, a1, 31
-; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -213,8 +213,8 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64ZBB-LABEL: findLastSet_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    clzw a1, a0
-; RV64ZBB-NEXT:    xori a1, a1, 31
 ; RV64ZBB-NEXT:    snez a0, a0
+; RV64ZBB-NEXT:    xori a1, a1, 31
 ; RV64ZBB-NEXT:    addi a0, a0, -1
 ; RV64ZBB-NEXT:    or a0, a0, a1
 ; RV64ZBB-NEXT:    ret
@@ -232,31 +232,31 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    beqz a0, .LBB4_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srliw a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -286,40 +286,40 @@ define i64 @ctlz_i64(i64 %a) nounwind {
 ; RV64I-NEXT:    beqz a0, .LBB5_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
 ; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 2
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 4
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 32
-; RV64I-NEXT:    or a0, a0, a1
-; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a3, 209715
+; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    addiw a2, a3, 819
+; RV64I-NEXT:    srli a3, a0, 2
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 4
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srli a3, a0, 32
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    not a0, a0
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -456,8 +456,8 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    addi a1, a1, %lo(.LCPI9_0)
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    addi a0, a0, 1
 ; RV64I-NEXT:    seqz a1, s0
+; RV64I-NEXT:    addi a0, a0, 1
 ; RV64I-NEXT:    addi a1, a1, -1
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -468,8 +468,8 @@ define signext i32 @ffs_i32(i32 signext %a) nounwind {
 ; RV64ZBB-LABEL: ffs_i32:
 ; RV64ZBB:       # %bb.0:
 ; RV64ZBB-NEXT:    ctzw a1, a0
-; RV64ZBB-NEXT:    addi a1, a1, 1
 ; RV64ZBB-NEXT:    seqz a0, a0
+; RV64ZBB-NEXT:    addi a1, a1, 1
 ; RV64ZBB-NEXT:    addi a0, a0, -1
 ; RV64ZBB-NEXT:    and a0, a0, a1
 ; RV64ZBB-NEXT:    ret
@@ -523,17 +523,17 @@ define signext i32 @ctpop_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    addiw a2, a2, 1365
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -630,21 +630,21 @@ define signext i32 @ctpop_i32_load(ptr %p) nounwind {
 ; RV64I-LABEL: ctpop_i32_load:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lw a0, 0(a0)
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi a1, a1, -241
+; RV64I-NEXT:    addi a1, a2, -241
 ; RV64I-NEXT:    and a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
@@ -670,39 +670,39 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 1
 ; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    lui a4, 209715
+; RV64I-NEXT:    srli a5, a1, 1
 ; RV64I-NEXT:    addiw a3, a3, 1365
 ; RV64I-NEXT:    and a2, a2, a3
-; RV64I-NEXT:    sub a0, a0, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a4, a0, a2
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    srli a4, a0, 4
-; RV64I-NEXT:    add a0, a0, a4
-; RV64I-NEXT:    lui a4, 61681
-; RV64I-NEXT:    addi a4, a4, -241
-; RV64I-NEXT:    and a0, a0, a4
-; RV64I-NEXT:    slli a5, a0, 8
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    slli a5, a0, 16
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    srliw a0, a0, 24
-; RV64I-NEXT:    srli a5, a1, 1
 ; RV64I-NEXT:    and a3, a5, a3
+; RV64I-NEXT:    lui a5, 61681
+; RV64I-NEXT:    addiw a4, a4, 819
+; RV64I-NEXT:    addi a5, a5, -241
+; RV64I-NEXT:    sub a0, a0, a2
 ; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a0, a4
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a3, a1, a4
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
-; RV64I-NEXT:    srli a2, a1, 4
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    and a0, a0, a4
 ; RV64I-NEXT:    and a1, a1, a4
-; RV64I-NEXT:    slli a2, a1, 8
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    slli a2, a1, 16
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    srli a3, a1, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    and a0, a0, a5
+; RV64I-NEXT:    and a1, a1, a5
+; RV64I-NEXT:    slli a2, a0, 8
+; RV64I-NEXT:    slli a3, a1, 8
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    slli a2, a0, 16
+; RV64I-NEXT:    slli a3, a1, 16
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    srliw a0, a0, 24
 ; RV64I-NEXT:    srliw a1, a1, 24
 ; RV64I-NEXT:    ret
 ;
@@ -720,11 +720,11 @@ define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    ret
 ;
@@ -745,11 +745,11 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    sext.w a0, a0
-; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    sext.w a1, a1
+; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    snez a1, a1
 ; RV64I-NEXT:    ret
 ;
@@ -758,8 +758,8 @@ define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind {
 ; RV64ZBB-NEXT:    cpopw a1, a1
 ; RV64ZBB-NEXT:    cpopw a0, a0
 ; RV64ZBB-NEXT:    sltiu a0, a0, 2
-; RV64ZBB-NEXT:    xori a0, a0, 1
 ; RV64ZBB-NEXT:    sltiu a1, a1, 2
+; RV64ZBB-NEXT:    xori a0, a0, 1
 ; RV64ZBB-NEXT:    xori a1, a1, 1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -785,8 +785,8 @@ define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind {
 ; RV64ZBB-NEXT:    cpopw a1, a1
 ; RV64ZBB-NEXT:    cpopw a0, a0
 ; RV64ZBB-NEXT:    addi a0, a0, -1
-; RV64ZBB-NEXT:    seqz a0, a0
 ; RV64ZBB-NEXT:    addi a1, a1, -1
+; RV64ZBB-NEXT:    seqz a0, a0
 ; RV64ZBB-NEXT:    seqz a1, a1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -801,11 +801,11 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
 ; RV64I-NEXT:    xor a0, a0, a2
 ; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    sltu a0, a2, a0
-; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    addiw a2, a1, -1
 ; RV64I-NEXT:    xor a1, a1, a2
 ; RV64I-NEXT:    sext.w a1, a1
 ; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    xori a1, a1, 1
 ; RV64I-NEXT:    ret
 ;
@@ -814,8 +814,8 @@ define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind {
 ; RV64ZBB-NEXT:    cpopw a1, a1
 ; RV64ZBB-NEXT:    cpopw a0, a0
 ; RV64ZBB-NEXT:    addi a0, a0, -1
-; RV64ZBB-NEXT:    snez a0, a0
 ; RV64ZBB-NEXT:    addi a1, a1, -1
+; RV64ZBB-NEXT:    snez a0, a0
 ; RV64ZBB-NEXT:    snez a1, a1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
@@ -828,28 +828,28 @@ declare i64 @llvm.ctpop.i64(i64)
 define i64 @ctpop_i64(i64 %a) nounwind {
 ; RV64I-LABEL: ctpop_i64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    srli a1, a0, 1
-; RV64I-NEXT:    lui a2, 349525
-; RV64I-NEXT:    addiw a2, a2, 1365
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a1, a1, a3
 ; RV64I-NEXT:    slli a3, a2, 32
 ; RV64I-NEXT:    add a2, a2, a3
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    srli a3, a0, 1
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    addiw a3, a3, -241
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    slli a2, a3, 32
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    add a2, a3, a2
+; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    slli a1, a0, 8
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    slli a1, a0, 16
@@ -950,49 +950,49 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 1
 ; RV64I-NEXT:    lui a3, 349525
+; RV64I-NEXT:    lui a4, 209715
+; RV64I-NEXT:    lui a5, 61681
 ; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    slli a4, a3, 32
-; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    addiw a4, a4, 819
+; RV64I-NEXT:    addiw a5, a5, -241
+; RV64I-NEXT:    slli a6, a3, 32
+; RV64I-NEXT:    add a3, a3, a6
+; RV64I-NEXT:    slli a6, a4, 32
+; RV64I-NEXT:    add a4, a4, a6
+; RV64I-NEXT:    slli a6, a5, 32
+; RV64I-NEXT:    add a5, a5, a6
+; RV64I-NEXT:    srli a6, a1, 1
 ; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    and a3, a6, a3
 ; RV64I-NEXT:    sub a0, a0, a2
-; RV64I-NEXT:    lui a2, 209715
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    slli a4, a2, 32
-; RV64I-NEXT:    add a2, a2, a4
-; RV64I-NEXT:    and a4, a0, a2
-; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    add a0, a4, a0
-; RV64I-NEXT:    srli a4, a0, 4
-; RV64I-NEXT:    add a0, a0, a4
-; RV64I-NEXT:    lui a4, 61681
-; RV64I-NEXT:    addiw a4, a4, -241
-; RV64I-NEXT:    slli a5, a4, 32
-; RV64I-NEXT:    add a4, a4, a5
-; RV64I-NEXT:    and a0, a0, a4
-; RV64I-NEXT:    slli a5, a0, 8
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    slli a5, a0, 16
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    slli a5, a0, 32
-; RV64I-NEXT:    add a0, a0, a5
-; RV64I-NEXT:    srli a0, a0, 56
-; RV64I-NEXT:    srli a5, a1, 1
-; RV64I-NEXT:    and a3, a5, a3
 ; RV64I-NEXT:    sub a1, a1, a3
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a0, a4
+; RV64I-NEXT:    srli a0, a0, 2
+; RV64I-NEXT:    and a3, a1, a4
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
-; RV64I-NEXT:    srli a2, a1, 4
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    and a0, a0, a4
 ; RV64I-NEXT:    and a1, a1, a4
-; RV64I-NEXT:    slli a2, a1, 8
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    slli a2, a1, 16
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    srli a2, a0, 4
+; RV64I-NEXT:    srli a3, a1, 4
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    and a0, a0, a5
+; RV64I-NEXT:    and a1, a1, a5
+; RV64I-NEXT:    slli a2, a0, 8
+; RV64I-NEXT:    slli a3, a1, 8
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    slli a2, a0, 16
+; RV64I-NEXT:    slli a3, a1, 16
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    slli a2, a0, 32
+; RV64I-NEXT:    slli a3, a1, 32
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    srli a0, a0, 56
 ; RV64I-NEXT:    srli a1, a1, 56
 ; RV64I-NEXT:    ret
 ;
@@ -1010,9 +1010,9 @@ define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    seqz a0, a0
 ; RV64I-NEXT:    seqz a1, a1
 ; RV64I-NEXT:    ret
 ;
@@ -1033,9 +1033,9 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    snez a1, a1
 ; RV64I-NEXT:    ret
 ;
@@ -1044,8 +1044,8 @@ define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind {
 ; RV64ZBB-NEXT:    cpop a1, a1
 ; RV64ZBB-NEXT:    cpop a0, a0
 ; RV64ZBB-NEXT:    sltiu a0, a0, 2
-; RV64ZBB-NEXT:    xori a0, a0, 1
 ; RV64ZBB-NEXT:    sltiu a1, a1, 2
+; RV64ZBB-NEXT:    xori a0, a0, 1
 ; RV64ZBB-NEXT:    xori a1, a1, 1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -1069,8 +1069,8 @@ define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind {
 ; RV64ZBB-NEXT:    cpop a1, a1
 ; RV64ZBB-NEXT:    cpop a0, a0
 ; RV64ZBB-NEXT:    addi a0, a0, -1
-; RV64ZBB-NEXT:    seqz a0, a0
 ; RV64ZBB-NEXT:    addi a1, a1, -1
+; RV64ZBB-NEXT:    seqz a0, a0
 ; RV64ZBB-NEXT:    seqz a1, a1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -1084,10 +1084,10 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
 ; RV64I-NEXT:    addi a2, a0, -1
 ; RV64I-NEXT:    xor a0, a0, a2
 ; RV64I-NEXT:    sltu a0, a2, a0
-; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    xor a1, a1, a2
 ; RV64I-NEXT:    sltu a1, a2, a1
+; RV64I-NEXT:    xori a0, a0, 1
 ; RV64I-NEXT:    xori a1, a1, 1
 ; RV64I-NEXT:    ret
 ;
@@ -1096,8 +1096,8 @@ define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind {
 ; RV64ZBB-NEXT:    cpop a1, a1
 ; RV64ZBB-NEXT:    cpop a0, a0
 ; RV64ZBB-NEXT:    addi a0, a0, -1
-; RV64ZBB-NEXT:    snez a0, a0
 ; RV64ZBB-NEXT:    addi a1, a1, -1
+; RV64ZBB-NEXT:    snez a0, a0
 ; RV64ZBB-NEXT:    snez a1, a1
 ; RV64ZBB-NEXT:    ret
   %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
@@ -1406,11 +1406,11 @@ define signext i32 @bswap_i32(i32 signext %a) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srliw a3, a0, 24
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    and a2, a0, a2
+; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    slli a2, a2, 8
 ; RV64I-NEXT:    slliw a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a2
@@ -1432,11 +1432,11 @@ define void @bswap_i32_nosext(i32 signext %a, ptr %x) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a2, a0, 8
 ; RV64I-NEXT:    lui a3, 16
+; RV64I-NEXT:    srliw a4, a0, 24
 ; RV64I-NEXT:    addi a3, a3, -256
 ; RV64I-NEXT:    and a2, a2, a3
-; RV64I-NEXT:    srliw a4, a0, 24
-; RV64I-NEXT:    or a2, a2, a4
 ; RV64I-NEXT:    and a3, a0, a3
+; RV64I-NEXT:    or a2, a2, a4
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
@@ -1462,28 +1462,28 @@ define i64 @bswap_i64(i64 %a) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    srli a1, a0, 40
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    srli a3, a0, 56
+; RV64I-NEXT:    srli a4, a0, 24
+; RV64I-NEXT:    lui a5, 4080
 ; RV64I-NEXT:    addiw a2, a2, -256
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    or a1, a1, a3
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    lui a4, 4080
-; RV64I-NEXT:    and a3, a3, a4
-; RV64I-NEXT:    srli a5, a0, 8
-; RV64I-NEXT:    srliw a5, a5, 24
-; RV64I-NEXT:    slli a5, a5, 24
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    and a4, a0, a4
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    srliw a3, a0, 24
-; RV64I-NEXT:    slli a3, a3, 32
-; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    srli a3, a0, 8
+; RV64I-NEXT:    and a4, a4, a5
+; RV64I-NEXT:    srliw a3, a3, 24
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    srliw a4, a0, 24
+; RV64I-NEXT:    and a5, a0, a5
 ; RV64I-NEXT:    and a2, a0, a2
-; RV64I-NEXT:    slli a2, a2, 40
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a5, a5, 24
+; RV64I-NEXT:    or a4, a5, a4
+; RV64I-NEXT:    slli a2, a2, 40
+; RV64I-NEXT:    or a1, a3, a1
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
index 4aa6cd42ab0994..985837d05caa20 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbkb.ll
@@ -275,8 +275,8 @@ define i64 @pack_i64_allWUsers(i32 signext %0, i32 signext %1, i32 signext %2) {
 ; RV64I-LABEL: pack_i64_allWUsers:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a2, a2, 32
+; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    srli a2, a2, 32
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
index da477aa2043cf1..a6ef184abe5e19 100644
--- a/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/65704-illegal-instruction.ll
@@ -25,11 +25,9 @@ define void @foo(<vscale x 8 x i8> %0) {
 ; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vsetivli zero, 0, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v9, v10, 0
+; CHECK-NEXT:    vslideup.vi v8, v10, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.x.s s1, v9
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 0
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.x.s s2, v8
 ; CHECK-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    mv a0, s1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 74693e655bf037..163d9145bc3623 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -575,12 +575,12 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 1
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v8, v16, 0, v0.t
 ; CHECK-NEXT:    vmax.vv v8, v16, v8, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index 08b310213d16e1..4ade6c09fe43de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -125,30 +125,30 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vsaddu.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_0)
 ; CHECK-NEXT:    vle8.v v16, (a0)
-; CHECK-NEXT:    vmsltu.vx v0, v8, a2
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_1)
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsext.vf8 v24, v16
-; CHECK-NEXT:    vsaddu.vx v16, v24, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vle8.v v17, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI9_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI9_2)
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vmsltu.vx v10, v16, a2
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v18, (a0)
+; CHECK-NEXT:    vmsltu.vx v0, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v16
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v17
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v17, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v9, 2
+; CHECK-NEXT:    vslideup.vi v0, v16, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v10, 4
+; CHECK-NEXT:    vslideup.vi v0, v17, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v8, v16, a1
+; CHECK-NEXT:    vsext.vf8 v8, v18
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
 ; CHECK-NEXT:    vmsltu.vx v16, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v16, 6
@@ -163,65 +163,60 @@ define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_0)
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_1)
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vle8.v v17, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_2)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_2)
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vmsltu.vx v10, v16, a2
+; CHECK-NEXT:    vle8.v v18, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_3)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_3)
-; CHECK-NEXT:    vle8.v v11, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v9
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v9, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v11
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v11, v16, a2
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vle8.v v19, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_4)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_4)
-; CHECK-NEXT:    vle8.v v12, (a0)
-; CHECK-NEXT:    vmsltu.vx v0, v16, a2
+; CHECK-NEXT:    vle8.v v20, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_5)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_5)
-; CHECK-NEXT:    vle8.v v13, (a0)
-; CHECK-NEXT:    vsext.vf8 v16, v12
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v12, v16, a2
-; CHECK-NEXT:    vsext.vf8 v16, v13
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v13, v16, a2
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v10, v9, 4
+; CHECK-NEXT:    vle8.v v21, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI10_6)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI10_6)
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v11, 6
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vle8.v v22, (a0)
+; CHECK-NEXT:    vmsltu.vx v0, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v16
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v16, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v17
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v17, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v18
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v18, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v19
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v19, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v20
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v20, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v21
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v21, v8, a2
+; CHECK-NEXT:    vsext.vf8 v8, v22
+; CHECK-NEXT:    vsaddu.vx v8, v8, a1
+; CHECK-NEXT:    vmsltu.vx v22, v8, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v12, 2
+; CHECK-NEXT:    vslideup.vi v17, v16, 2
+; CHECK-NEXT:    vslideup.vi v0, v20, 2
 ; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v13, 4
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vsext.vf8 v16, v8
-; CHECK-NEXT:    vsaddu.vx v16, v16, a1
-; CHECK-NEXT:    vmsltu.vx v8, v16, a2
+; CHECK-NEXT:    vslideup.vi v17, v18, 4
+; CHECK-NEXT:    vslideup.vi v0, v21, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v8, 6
+; CHECK-NEXT:    vslideup.vi v17, v19, 6
+; CHECK-NEXT:    vslideup.vi v0, v22, 6
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v0, v10, 8
+; CHECK-NEXT:    vslideup.vi v0, v17, 8
 ; CHECK-NEXT:    ret
   %mask = call <128 x i1> @llvm.get.active.lane.mask.v128i1.i64(i64 %index, i64 %tc)
   ret <128 x i1> %mask

diff  --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
index abe1920e437842..9ac2775d306682 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
@@ -16,18 +16,18 @@ define void @test(ptr %addr) {
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb
 ; CHECK-NEXT:    csrrs a1, vlenb, zero
-; CHECK-NEXT:    add a2, a0, a1
-; CHECK-NEXT:    vl1re64.v v8, (a2)
+; CHECK-NEXT:    vl1re64.v v8, (a0)
 ; CHECK-NEXT:    slli a2, a1, 1
-; CHECK-NEXT:    vl1re64.v v9, (a0)
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    vl1re64.v v10, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v9, (a0)
-; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vs1r.v v10, (a2)
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vl1re64.v v9, (a3)
+; CHECK-NEXT:    addi a3, sp, 16
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vs1r.v v8, (a0)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re64.v v10, (a0)
+; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    vs1r.v v8, (a3)
+; CHECK-NEXT:    vs1r.v v9, (a2)
+; CHECK-NEXT:    vs1r.v v10, (a1)
 ; CHECK-NEXT:    csrrs a0, vlenb, zero
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
index 97903396679158..fb25d4e15e40e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
@@ -20,8 +20,8 @@ define <vscale x 1 x double> @test(ptr %addr, i64 %vl) {
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    vl1re64.v v9, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    vs1r.v v9, (a2)
 ; CHECK-NEXT:    vl1re64.v v8, (a2)
 ; CHECK-NEXT:    vl1re64.v v9, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll
index 4cd1b045529e3f..853f937bbd2301 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-vector-tuple.ll
@@ -14,8 +14,8 @@ define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @load_store_m1x5(targe
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    csrrs a1, vlenb, zero
+; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    vs1r.v v9, (a2)
 ; CHECK-NEXT:    add a3, a2, a1
@@ -57,8 +57,8 @@ define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @load_store_m2x2(targ
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    csrrs a1, vlenb, zero
+; CHECK-NEXT:    vs2r.v v8, (a0)
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    vs2r.v v10, (a1)
@@ -92,8 +92,8 @@ define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @load_store_m4x2(targ
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    csrrs a1, vlenb, zero
+; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    vs4r.v v12, (a1)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
index 43be8feece23c1..7fe6bd24a2552d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll
@@ -774,8 +774,8 @@ define void @lmul_16_align() nounwind {
 ; NOZBA-NEXT:    csrr a0, vlenb
 ; NOZBA-NEXT:    add a0, sp, a0
 ; NOZBA-NEXT:    addi a0, a0, 128
-; NOZBA-NEXT:    vs8r.v v8, (a0)
 ; NOZBA-NEXT:    csrr a1, vlenb
+; NOZBA-NEXT:    vs8r.v v8, (a0)
 ; NOZBA-NEXT:    slli a1, a1, 3
 ; NOZBA-NEXT:    add a0, a0, a1
 ; NOZBA-NEXT:    vs8r.v v8, (a0)
@@ -805,8 +805,8 @@ define void @lmul_16_align() nounwind {
 ; ZBA-NEXT:    csrr a0, vlenb
 ; ZBA-NEXT:    add a0, sp, a0
 ; ZBA-NEXT:    addi a0, a0, 128
-; ZBA-NEXT:    vs8r.v v8, (a0)
 ; ZBA-NEXT:    csrr a1, vlenb
+; ZBA-NEXT:    vs8r.v v8, (a0)
 ; ZBA-NEXT:    sh3add a0, a1, a0
 ; ZBA-NEXT:    vs8r.v v8, (a0)
 ; ZBA-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
@@ -837,8 +837,8 @@ define void @lmul_16_align() nounwind {
 ; NOMUL-NEXT:    csrr a0, vlenb
 ; NOMUL-NEXT:    add a0, sp, a0
 ; NOMUL-NEXT:    addi a0, a0, 128
-; NOMUL-NEXT:    vs8r.v v8, (a0)
 ; NOMUL-NEXT:    csrr a1, vlenb
+; NOMUL-NEXT:    vs8r.v v8, (a0)
 ; NOMUL-NEXT:    slli a1, a1, 3
 ; NOMUL-NEXT:    add a0, a0, a1
 ; NOMUL-NEXT:    vs8r.v v8, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
index 29d19ed38bbeda..1ed84316d4484c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-sdnode.ll
@@ -10,17 +10,17 @@ define <vscale x 1 x i8> @bitreverse_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsll.vi v9, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -41,17 +41,17 @@ define <vscale x 2 x i8> @bitreverse_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsll.vi v9, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -72,17 +72,17 @@ define <vscale x 4 x i8> @bitreverse_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsll.vi v9, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -103,17 +103,17 @@ define <vscale x 8 x i8> @bitreverse_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vsll.vi v9, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -134,17 +134,17 @@ define <vscale x 16 x i8> @bitreverse_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vsll.vi v10, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -165,17 +165,17 @@ define <vscale x 32 x i8> @bitreverse_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsll.vi v12, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -196,17 +196,17 @@ define <vscale x 64 x i8> @bitreverse_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vsll.vi v16, v8, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -227,26 +227,26 @@ define <vscale x 1 x i16> @bitreverse_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -267,26 +267,26 @@ define <vscale x 2 x i16> @bitreverse_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -307,26 +307,26 @@ define <vscale x 4 x i16> @bitreverse_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -347,26 +347,26 @@ define <vscale x 8 x i16> @bitreverse_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -387,26 +387,26 @@ define <vscale x 16 x i16> @bitreverse_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -427,26 +427,26 @@ define <vscale x 32 x i16> @bitreverse_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -467,34 +467,34 @@ define <vscale x 1 x i32> @bitreverse_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -515,34 +515,34 @@ define <vscale x 2 x i32> @bitreverse_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -563,34 +563,34 @@ define <vscale x 4 x i32> @bitreverse_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsll.vi v12, v12, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vsll.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -611,34 +611,34 @@ define <vscale x 8 x i32> @bitreverse_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
-; CHECK-NEXT:    vand.vx v16, v8, a0
-; CHECK-NEXT:    vsll.vi v16, v16, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vsll.vi v16, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -659,34 +659,34 @@ define <vscale x 16 x i32> @bitreverse_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    vor.vv v16, v16, v24
-; CHECK-NEXT:    vand.vx v24, v8, a0
-; CHECK-NEXT:    vsll.vi v24, v24, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vsll.vi v24, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -707,65 +707,65 @@ define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v10, a2
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v11, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v10, v10, a3
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v12, v8, a1
+; RV32-NEXT:    vand.vx v11, v11, a0
+; RV32-NEXT:    vlse64.v v13, (a5), zero
+; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vx v11, v8, a0
+; RV32-NEXT:    vsll.vx v11, v11, a2
+; RV32-NEXT:    vor.vv v11, v12, v11
 ; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a0
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    vsll.vx v12, v12, a1
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vand.vx v9, v9, a4
+; RV32-NEXT:    vand.vv v12, v12, v13
+; RV32-NEXT:    vor.vv v9, v12, v9
 ; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    vand.vv v12, v8, v13
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v9, v9, v10
+; RV32-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v11, a2
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vsrl.vi v9, v8, 4
 ; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -774,60 +774,60 @@ define <vscale x 1 x i64> @bitreverse_nxv1i64(<vscale x 1 x i64> %va) {
 ;
 ; RV64-LABEL: bitreverse_nxv1i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vx v9, v8, a0
-; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v10, v8, a1
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    li a0, 40
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v10, v10, a2
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vi v10, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV64-NEXT:    vsrl.vi v9, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v10, v10, a3
+; RV64-NEXT:    vsrl.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vx v11, v8, a0
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v11, v11, a2
+; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vsrl.vi v11, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v9, v9, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vor.vv v9, v11, v9
+; RV64-NEXT:    vand.vx v11, v8, a3
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    vor.vv v9, v9, v10
+; RV64-NEXT:    vand.vx v10, v8, a4
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    vsll.vi v11, v11, 24
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vand.vx v10, v8, a3
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    vand.vx v11, v8, a4
-; RV64-NEXT:    vsll.vi v11, v11, 8
-; RV64-NEXT:    vor.vv v10, v10, v11
-; RV64-NEXT:    vsll.vx v11, v8, a0
+; RV64-NEXT:    vsll.vx v11, v8, a1
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vsll.vx v8, v8, a1
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    slli a0, a4, 32
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    add a1, a1, a3
 ; RV64-NEXT:    vor.vv v8, v11, v8
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v9, v9, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    ret
@@ -848,65 +848,65 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a0
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    vsrl.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v14, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v12, v12, a3
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v14
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a0
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    vsll.vx v16, v16, a1
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a3
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vsrl.vx v10, v8, a1
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a0
+; RV32-NEXT:    vlse64.v v14, (a5), zero
+; RV32-NEXT:    vor.vv v12, v12, v10
+; RV32-NEXT:    vand.vx v10, v8, a0
+; RV32-NEXT:    vsll.vx v10, v10, a2
+; RV32-NEXT:    vor.vv v10, v18, v10
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v16, v16, a4
+; RV32-NEXT:    vand.vv v18, v18, v14
+; RV32-NEXT:    vor.vv v16, v18, v16
 ; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    vand.vv v14, v8, v14
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v12, v16, v12
+; RV32-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vsll.vi v14, v14, 8
+; RV32-NEXT:    vor.vv v8, v8, v14
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v14, a1
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a2
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v12, v12, v14
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v12, v10
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -915,60 +915,60 @@ define <vscale x 2 x i64> @bitreverse_nxv2i64(<vscale x 2 x i64> %va) {
 ;
 ; RV64-LABEL: bitreverse_nxv2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vx v10, v8, a0
-; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v12, v8, a1
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    li a0, 40
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v12, v12, a2
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v12, v12, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a1
+; RV64-NEXT:    vsrl.vx v14, v8, a0
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v14, v14, a2
+; RV64-NEXT:    vor.vv v12, v14, v12
 ; RV64-NEXT:    vsrl.vi v14, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v14, v14, a4
+; RV64-NEXT:    vor.vv v10, v14, v10
+; RV64-NEXT:    vand.vx v14, v8, a3
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    vor.vv v10, v10, v12
+; RV64-NEXT:    vand.vx v12, v8, a4
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    vsll.vi v14, v14, 24
+; RV64-NEXT:    vsll.vi v12, v12, 8
 ; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vand.vx v12, v8, a3
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    vand.vx v14, v8, a4
-; RV64-NEXT:    vsll.vi v14, v14, 8
-; RV64-NEXT:    vor.vv v12, v12, v14
-; RV64-NEXT:    vsll.vx v14, v8, a0
+; RV64-NEXT:    vsll.vx v14, v8, a1
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vsll.vx v8, v8, a1
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    slli a0, a4, 32
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    add a1, a1, a3
 ; RV64-NEXT:    vor.vv v8, v14, v8
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    ret
@@ -989,65 +989,65 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a0
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v20, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v16, v16, a3
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v20
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsll.vx v16, v8, a0
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    vsll.vx v24, v24, a1
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    vand.vx v24, v8, a3
-; RV32-NEXT:    vsll.vi v24, v24, 24
-; RV32-NEXT:    vand.vv v8, v8, v20
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vsrl.vx v12, v8, a1
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a0
+; RV32-NEXT:    vlse64.v v20, (a5), zero
+; RV32-NEXT:    vor.vv v16, v16, v12
+; RV32-NEXT:    vand.vx v12, v8, a0
+; RV32-NEXT:    vsll.vx v12, v12, a2
+; RV32-NEXT:    vor.vv v12, v28, v12
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v24, v24, a4
+; RV32-NEXT:    vand.vv v28, v28, v20
+; RV32-NEXT:    vor.vv v24, v28, v24
 ; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    vand.vv v20, v8, v20
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vsetvli a3, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    vsll.vi v20, v20, 8
+; RV32-NEXT:    vor.vv v8, v8, v20
+; RV32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v20, a1
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a2
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v16, v16, v20
+; RV32-NEXT:    vsll.vi v8, v8, 2
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v16, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1056,60 +1056,60 @@ define <vscale x 4 x i64> @bitreverse_nxv4i64(<vscale x 4 x i64> %va) {
 ;
 ; RV64-LABEL: bitreverse_nxv4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vx v12, v8, a0
-; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v16, v8, a1
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    li a0, 40
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a1
+; RV64-NEXT:    vsrl.vx v20, v8, a0
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v20, v20, a2
+; RV64-NEXT:    vor.vv v12, v20, v12
 ; RV64-NEXT:    vsrl.vi v20, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vor.vv v20, v20, v16
 ; RV64-NEXT:    vand.vx v16, v8, a3
-; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    vor.vv v12, v20, v12
 ; RV64-NEXT:    vand.vx v20, v8, a4
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    vsll.vi v16, v16, 24
 ; RV64-NEXT:    vsll.vi v20, v20, 8
 ; RV64-NEXT:    vor.vv v16, v16, v20
-; RV64-NEXT:    vsll.vx v20, v8, a0
+; RV64-NEXT:    vsll.vx v20, v8, a1
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vsll.vx v8, v8, a1
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    slli a0, a4, 32
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    add a1, a1, a3
 ; RV64-NEXT:    vor.vv v8, v20, v8
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    ret
@@ -1130,80 +1130,87 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a0
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v24, a2
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v0, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a0
 ; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v0, v0, a3
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vsll.vx v16, v16, a2
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v0, (a5), zero
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a4
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a2
-; RV32-NEXT:    vsll.vx v0, v0, a1
-; RV32-NEXT:    vsll.vx v24, v8, a0
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    lui a2, 349525
 ; RV32-NEXT:    addi a0, a0, -241
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a0
+; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a0, 209715
-; RV32-NEXT:    addi a0, a0, 819
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a0, 349525
-; RV32-NEXT:    addi a0, a0, 1365
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1212,60 +1219,60 @@ define <vscale x 8 x i64> @bitreverse_nxv8i64(<vscale x 8 x i64> %va) {
 ;
 ; RV64-LABEL: bitreverse_nxv8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v16, v8, a0
-; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v24, v8, a1
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    li a0, 40
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a1
+; RV64-NEXT:    vsrl.vx v0, v8, a0
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v0, v0, a2
+; RV64-NEXT:    vor.vv v16, v0, v16
 ; RV64-NEXT:    vsrl.vi v0, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v0, v0, v24
 ; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    vor.vv v16, v0, v16
 ; RV64-NEXT:    vand.vx v0, v8, a4
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    vsll.vi v24, v24, 24
 ; RV64-NEXT:    vsll.vi v0, v0, 8
 ; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vsll.vx v0, v8, a0
+; RV64-NEXT:    vsll.vx v0, v8, a1
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vsll.vx v8, v8, a1
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    vsll.vx v8, v8, a0
+; RV64-NEXT:    slli a0, a4, 32
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    add a1, a1, a3
 ; RV64-NEXT:    vor.vv v8, v0, v8
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 8abe35bf1d97ec..66a1178cddb66c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -15,18 +15,18 @@ define <vscale x 1 x i8> @vp_bitreverse_nxv1i8(<vscale x 1 x i8> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -47,20 +47,20 @@ define <vscale x 1 x i8> @vp_bitreverse_nxv1i8_unmasked(<vscale x 1 x i8> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -81,18 +81,18 @@ define <vscale x 2 x i8> @vp_bitreverse_nxv2i8(<vscale x 2 x i8> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -113,20 +113,20 @@ define <vscale x 2 x i8> @vp_bitreverse_nxv2i8_unmasked(<vscale x 2 x i8> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -147,18 +147,18 @@ define <vscale x 4 x i8> @vp_bitreverse_nxv4i8(<vscale x 4 x i8> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -179,20 +179,20 @@ define <vscale x 4 x i8> @vp_bitreverse_nxv4i8_unmasked(<vscale x 4 x i8> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -213,18 +213,18 @@ define <vscale x 8 x i8> @vp_bitreverse_nxv8i8(<vscale x 8 x i8> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -245,20 +245,20 @@ define <vscale x 8 x i8> @vp_bitreverse_nxv8i8_unmasked(<vscale x 8 x i8> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -279,18 +279,18 @@ define <vscale x 16 x i8> @vp_bitreverse_nxv16i8(<vscale x 16 x i8> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v10, v10, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -311,20 +311,20 @@ define <vscale x 16 x i8> @vp_bitreverse_nxv16i8_unmasked(<vscale x 16 x i8> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v8, 15
-; CHECK-NEXT:    vsll.vi v10, v10, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v10, v10, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -345,18 +345,18 @@ define <vscale x 32 x i8> @vp_bitreverse_nxv32i8(<vscale x 32 x i8> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v12, v12, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -377,20 +377,20 @@ define <vscale x 32 x i8> @vp_bitreverse_nxv32i8_unmasked(<vscale x 32 x i8> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v8, 15
-; CHECK-NEXT:    vsll.vi v12, v12, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v12, v12, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -411,18 +411,18 @@ define <vscale x 64 x i8> @vp_bitreverse_nxv64i8(<vscale x 64 x i8> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -443,20 +443,20 @@ define <vscale x 64 x i8> @vp_bitreverse_nxv64i8_unmasked(<vscale x 64 x i8> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v8, 15
-; CHECK-NEXT:    vsll.vi v16, v16, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v16, v16, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -477,25 +477,25 @@ define <vscale x 1 x i16> @vp_bitreverse_nxv1i16(<vscale x 1 x i16> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -517,26 +517,26 @@ define <vscale x 1 x i16> @vp_bitreverse_nxv1i16_unmasked(<vscale x 1 x i16> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -557,25 +557,25 @@ define <vscale x 2 x i16> @vp_bitreverse_nxv2i16(<vscale x 2 x i16> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -597,26 +597,26 @@ define <vscale x 2 x i16> @vp_bitreverse_nxv2i16_unmasked(<vscale x 2 x i16> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -637,25 +637,25 @@ define <vscale x 4 x i16> @vp_bitreverse_nxv4i16(<vscale x 4 x i16> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -677,26 +677,26 @@ define <vscale x 4 x i16> @vp_bitreverse_nxv4i16_unmasked(<vscale x 4 x i16> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -717,25 +717,25 @@ define <vscale x 8 x i16> @vp_bitreverse_nxv8i16(<vscale x 8 x i16> %va, <vscale
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -757,26 +757,26 @@ define <vscale x 8 x i16> @vp_bitreverse_nxv8i16_unmasked(<vscale x 8 x i16> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -797,25 +797,25 @@ define <vscale x 16 x i16> @vp_bitreverse_nxv16i16(<vscale x 16 x i16> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -837,26 +837,26 @@ define <vscale x 16 x i16> @vp_bitreverse_nxv16i16_unmasked(<vscale x 16 x i16>
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -877,25 +877,25 @@ define <vscale x 32 x i16> @vp_bitreverse_nxv32i16(<vscale x 32 x i16> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 4, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -917,26 +917,26 @@ define <vscale x 32 x i16> @vp_bitreverse_nxv32i16_unmasked(<vscale x 32 x i16>
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -963,27 +963,27 @@ define <vscale x 1 x i32> @vp_bitreverse_nxv1i32(<vscale x 1 x i32> %va, <vscale
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1005,34 +1005,34 @@ define <vscale x 1 x i32> @vp_bitreverse_nxv1i32_unmasked(<vscale x 1 x i32> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -1059,27 +1059,27 @@ define <vscale x 2 x i32> @vp_bitreverse_nxv2i32(<vscale x 2 x i32> %va, <vscale
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1101,34 +1101,34 @@ define <vscale x 2 x i32> @vp_bitreverse_nxv2i32_unmasked(<vscale x 2 x i32> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -1155,27 +1155,27 @@ define <vscale x 4 x i32> @vp_bitreverse_nxv4i32(<vscale x 4 x i32> %va, <vscale
 ; CHECK-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v10, v10, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1197,34 +1197,34 @@ define <vscale x 4 x i32> @vp_bitreverse_nxv4i32_unmasked(<vscale x 4 x i32> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsll.vi v12, v12, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vsll.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -1251,27 +1251,27 @@ define <vscale x 8 x i32> @vp_bitreverse_nxv8i32(<vscale x 8 x i32> %va, <vscale
 ; CHECK-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v12, v12, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1293,34 +1293,34 @@ define <vscale x 8 x i32> @vp_bitreverse_nxv8i32_unmasked(<vscale x 8 x i32> %va
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
-; CHECK-NEXT:    vand.vx v16, v8, a0
-; CHECK-NEXT:    vsll.vi v16, v16, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vsll.vi v16, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -1347,27 +1347,27 @@ define <vscale x 16 x i32> @vp_bitreverse_nxv16i32(<vscale x 16 x i32> %va, <vsc
 ; CHECK-NEXT:    vsrl.vi v24, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vand.vx v24, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 4, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
+; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v8, v16, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -1389,34 +1389,34 @@ define <vscale x 16 x i32> @vp_bitreverse_nxv16i32_unmasked(<vscale x 16 x i32>
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    vor.vv v16, v16, v24
-; CHECK-NEXT:    vand.vx v24, v8, a0
-; CHECK-NEXT:    vsll.vi v24, v24, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 4
+; CHECK-NEXT:    vsll.vi v24, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v16, v16, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
@@ -1437,68 +1437,67 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64(<vscale x 1 x i64> %va, <vscale
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v9, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v11, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v11, v11, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v11, v0.t
+; RV32-NEXT:    vand.vx v11, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v9, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v11, v0.t
+; RV32-NEXT:    vsrl.vx v11, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a4
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
-; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v11, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v9, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v11, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1508,59 +1507,59 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64(<vscale x 1 x i64> %va, <vscale
 ; RV64-LABEL: vp_bitreverse_nxv1i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a7, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
 ; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    ret
@@ -1580,67 +1579,67 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64_unmasked(<vscale x 1 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4
-; RV32-NEXT:    vsll.vi v11, v11, 24
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v9, v11
-; RV32-NEXT:    vsrl.vx v11, v8, a1
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a2
+; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vand.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a1
 ; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a6), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v10, v10, v13
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vv v13, v13, v12
+; RV32-NEXT:    vor.vv v9, v13, v9
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v11, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1650,59 +1649,59 @@ define <vscale x 1 x i64> @vp_bitreverse_nxv1i64_unmasked(<vscale x 1 x i64> %va
 ; RV64-LABEL: vp_bitreverse_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsll.vi v9, v9, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0
-; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4
-; RV64-NEXT:    vor.vv v10, v10, v11
+; RV64-NEXT:    vsrl.vi v9, v8, 24
+; RV64-NEXT:    vsrl.vi v10, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v11, v8, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vx v11, v8, a4
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsrl.vi v11, v8, 24
-; RV64-NEXT:    vand.vx v11, v11, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v11
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vor.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v9, v9, v11
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vor.vv v8, v8, v9
+; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v9, v9, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    ret
@@ -1723,68 +1722,67 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64(<vscale x 2 x i64> %va, <vscale
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v12, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v14, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v14, v14, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v14, v0.t
+; RV32-NEXT:    vand.vx v14, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v14, v14, 24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v14, v0.t
-; RV32-NEXT:    vsrl.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v14, v0.t
+; RV32-NEXT:    vsrl.vx v14, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a4
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v14, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v14, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v14, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1794,59 +1792,59 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64(<vscale x 2 x i64> %va, <vscale
 ; RV64-LABEL: vp_bitreverse_nxv2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v12, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4, v0.t
+; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v14, v14, a7, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
 ; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV64-NEXT:    vsrl.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v14, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v14, v14, a3, v0.t
+; RV64-NEXT:    vsrl.vx v14, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
 ; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    ret
@@ -1866,67 +1864,67 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64_unmasked(<vscale x 2 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v14, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v10, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vand.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v10, v16, v10
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4
-; RV32-NEXT:    vsll.vi v14, v14, 24
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v14, v14, v16
-; RV32-NEXT:    vor.vv v10, v10, v14
-; RV32-NEXT:    vsrl.vx v14, v8, a1
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vsll.vx v18, v18, a4
+; RV32-NEXT:    vor.vv v12, v12, v18
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v14, v14, a5
+; RV32-NEXT:    vand.vv v18, v18, v16
+; RV32-NEXT:    vor.vv v14, v18, v14
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a4, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v10, v14, v10
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v14, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v10, v10, v16
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v10, v10, v14
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1936,59 +1934,59 @@ define <vscale x 2 x i64> @vp_bitreverse_nxv2i64_unmasked(<vscale x 2 x i64> %va
 ; RV64-LABEL: vp_bitreverse_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4
-; RV64-NEXT:    vor.vv v12, v12, v14
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vx v14, v8, a4
-; RV64-NEXT:    vand.vx v14, v14, a3
+; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsrl.vi v14, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v10, v8, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a5
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vor.vv v10, v16, v10
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v14, v14, a2
 ; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsrl.vi v14, v8, 24
-; RV64-NEXT:    vand.vx v14, v14, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v14, v8, a2
+; RV64-NEXT:    vsll.vi v14, v14, 8
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v14
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vor.vv v8, v8, v14
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    ret
@@ -2009,70 +2007,69 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64(<vscale x 4 x i64> %va, <vscale
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v20, v20, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v24, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v28, a4
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vor.vv v20, v8, v20, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vsrl.vi v20, v16, 4, v0.t
+; RV32-NEXT:    vand.vv v20, v20, v28, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v28, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 4, v0.t
+; RV32-NEXT:    vor.vv v16, v20, v16, v0.t
+; RV32-NEXT:    vsrl.vi v20, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v20, v20, v12, v0.t
+; RV32-NEXT:    vand.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vor.vv v12, v20, v12, v0.t
+; RV32-NEXT:    vsrl.vi v16, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -2080,59 +2077,59 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64(<vscale x 4 x i64> %va, <vscale
 ; RV64-LABEL: vp_bitreverse_nxv4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v20, v20, a7, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v20, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v20, v20, a3, v0.t
+; RV64-NEXT:    vsrl.vx v20, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
 ; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    ret
@@ -2152,67 +2149,67 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64_unmasked(<vscale x 4 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v20, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vand.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v12, v24, v12
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4
-; RV32-NEXT:    vsll.vi v20, v20, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vor.vv v12, v12, v20
-; RV32-NEXT:    vsrl.vx v20, v8, a1
-; RV32-NEXT:    vsrl.vx v24, v8, a3
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v24, v24, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vsll.vx v28, v28, a4
+; RV32-NEXT:    vor.vv v16, v16, v28
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v20, v20, a5
+; RV32-NEXT:    vand.vv v28, v28, v24
+; RV32-NEXT:    vor.vv v20, v28, v20
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsetvli a4, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v12, v20, v12
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v20, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v12, v12, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v12, v12, v20
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2222,59 +2219,59 @@ define <vscale x 4 x i64> @vp_bitreverse_nxv4i64_unmasked(<vscale x 4 x i64> %va
 ; RV64-LABEL: vp_bitreverse_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v16, v20
-; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vx v20, v8, a4
-; RV64-NEXT:    vand.vx v20, v20, a3
+; RV64-NEXT:    vsrl.vi v16, v8, 24
+; RV64-NEXT:    vsrl.vi v20, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v12, v8, a3
+; RV64-NEXT:    vsrl.vx v24, v8, a5
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vor.vv v12, v24, v12
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v20, v20, a2
 ; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vsrl.vi v20, v8, 24
-; RV64-NEXT:    vand.vx v20, v20, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v20, v8, a2
+; RV64-NEXT:    vsll.vi v20, v20, 8
+; RV64-NEXT:    vor.vv v20, v24, v20
+; RV64-NEXT:    vsll.vx v24, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v20
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vor.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v24, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vor.vv v8, v8, v20
+; RV64-NEXT:    vor.vv v8, v8, v12
+; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    ret
@@ -2301,33 +2298,33 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
@@ -2346,14 +2343,14 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
@@ -2365,42 +2362,42 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vsll.vi v24, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -2419,66 +2416,65 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2504,82 +2500,89 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v24, v8, a1
+; RV32-NEXT:    vsll.vx v24, v24, a4
 ; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v24, v8, v24
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2588,62 +2591,78 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64_unmasked(<vscale x 7 x i64> %va
 ;
 ; RV64-LABEL: vp_bitreverse_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v0, v0, v16
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsll.vx v8, v8, a5
 ; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v0
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_bitreverse_nxv7i64_unmasked:
@@ -2668,33 +2687,33 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a5, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
@@ -2713,14 +2732,14 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
@@ -2732,42 +2751,42 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    vsll.vi v24, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -2786,66 +2805,65 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2871,82 +2889,89 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v24, v8, a1
+; RV32-NEXT:    vsll.vx v24, v24, a4
 ; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
 ; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v24, v8, v24
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 4
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2955,62 +2980,78 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64_unmasked(<vscale x 8 x i64> %va
 ;
 ; RV64-LABEL: vp_bitreverse_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v0, v0, v16
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsll.vx v8, v8, a5
 ; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v0
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_bitreverse_nxv8i64_unmasked:
@@ -3040,69 +3081,69 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    srli a1, a2, 1
-; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    slli a2, a2, 2
-; CHECK-NEXT:    sub a1, a0, a2
-; CHECK-NEXT:    sltu a3, a0, a1
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a1, a3, a1
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    lui a2, 3
+; CHECK-NEXT:    srli a4, a3, 1
+; CHECK-NEXT:    slli a3, a3, 2
+; CHECK-NEXT:    vsetvli a5, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a4
+; CHECK-NEXT:    sub a4, a0, a3
+; CHECK-NEXT:    sltu a5, a0, a4
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a5, a5, a4
+; CHECK-NEXT:    lui a6, 5
+; CHECK-NEXT:    addi a4, a1, -241
+; CHECK-NEXT:    addi a2, a2, 819
+; CHECK-NEXT:    addi a1, a6, 1365
+; CHECK-NEXT:    vsetvli zero, a5, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
-; CHECK-NEXT:    vor.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
-; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; CHECK-NEXT:    lui a3, 3
-; CHECK-NEXT:    addi a3, a3, 819
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
-; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v16, 1, v0.t
-; CHECK-NEXT:    lui a4, 5
-; CHECK-NEXT:    addi a4, a4, 1365
-; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
-; CHECK-NEXT:    vsll.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    addi a5, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    bltu a0, a2, .LBB46_2
+; CHECK-NEXT:    bltu a0, a3, .LBB46_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:  .LBB46_2:
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -3120,9 +3161,9 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 1
+; CHECK-ZVBB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVBB-NEXT:    sub a2, a0, a1
 ; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
 ; CHECK-ZVBB-NEXT:    addi a3, a3, -1
@@ -3144,58 +3185,58 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 define <vscale x 64 x i16> @vp_bitreverse_nxv64i16_unmasked(<vscale x 64 x i16> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bitreverse_nxv64i16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    lui a2, 3
+; CHECK-NEXT:    slli a3, a3, 2
+; CHECK-NEXT:    sub a4, a0, a3
+; CHECK-NEXT:    sltu a5, a0, a4
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a5, a5, a4
+; CHECK-NEXT:    lui a6, 5
+; CHECK-NEXT:    addi a4, a1, -241
+; CHECK-NEXT:    addi a2, a2, 819
+; CHECK-NEXT:    addi a1, a6, 1365
+; CHECK-NEXT:    vsetvli zero, a5, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 8
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
 ; CHECK-NEXT:    vor.vv v16, v16, v24
 ; CHECK-NEXT:    vsrl.vi v24, v16, 4
-; CHECK-NEXT:    lui a2, 1
-; CHECK-NEXT:    addi a2, a2, -241
-; CHECK-NEXT:    vand.vx v24, v24, a2
-; CHECK-NEXT:    vand.vx v16, v16, a2
+; CHECK-NEXT:    vand.vx v16, v16, a4
+; CHECK-NEXT:    vand.vx v24, v24, a4
 ; CHECK-NEXT:    vsll.vi v16, v16, 4
 ; CHECK-NEXT:    vor.vv v16, v24, v16
 ; CHECK-NEXT:    vsrl.vi v24, v16, 2
-; CHECK-NEXT:    lui a3, 3
-; CHECK-NEXT:    addi a3, a3, 819
-; CHECK-NEXT:    vand.vx v24, v24, a3
-; CHECK-NEXT:    vand.vx v16, v16, a3
+; CHECK-NEXT:    vand.vx v16, v16, a2
+; CHECK-NEXT:    vand.vx v24, v24, a2
 ; CHECK-NEXT:    vsll.vi v16, v16, 2
 ; CHECK-NEXT:    vor.vv v16, v24, v16
 ; CHECK-NEXT:    vsrl.vi v24, v16, 1
-; CHECK-NEXT:    lui a4, 5
-; CHECK-NEXT:    addi a4, a4, 1365
-; CHECK-NEXT:    vand.vx v24, v24, a4
-; CHECK-NEXT:    vand.vx v16, v16, a4
+; CHECK-NEXT:    vand.vx v16, v16, a1
+; CHECK-NEXT:    vand.vx v24, v24, a1
 ; CHECK-NEXT:    vadd.vv v16, v16, v16
 ; CHECK-NEXT:    vor.vv v16, v24, v16
-; CHECK-NEXT:    bltu a0, a1, .LBB47_2
+; CHECK-NEXT:    bltu a0, a3, .LBB47_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:  .LBB47_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v24
 ; CHECK-NEXT:    vsrl.vi v24, v8, 4
-; CHECK-NEXT:    vand.vx v24, v24, a2
-; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vand.vx v8, v8, a4
+; CHECK-NEXT:    vand.vx v24, v24, a4
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    vsrl.vi v24, v8, 2
-; CHECK-NEXT:    vand.vx v24, v24, a3
-; CHECK-NEXT:    vand.vx v8, v8, a3
+; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vand.vx v24, v24, a2
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    vsrl.vi v24, v8, 1
-; CHECK-NEXT:    vand.vx v24, v24, a4
-; CHECK-NEXT:    vand.vx v8, v8, a4
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v24, v24, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    ret
@@ -3228,25 +3269,25 @@ define <vscale x 1 x i9> @vp_bitreverse_nxv1i9(<vscale x 1 x i9> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
index 24c6b32cbfa963..2cd763afa36b73 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-sdnode.ll
@@ -124,9 +124,9 @@ define <vscale x 1 x i32> @bswap_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -151,9 +151,9 @@ define <vscale x 2 x i32> @bswap_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -178,9 +178,9 @@ define <vscale x 4 x i32> @bswap_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsll.vi v12, v12, 8
@@ -205,9 +205,9 @@ define <vscale x 8 x i32> @bswap_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
@@ -232,9 +232,9 @@ define <vscale x 16 x i32> @bswap_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    vor.vv v16, v16, v24
 ; CHECK-NEXT:    vand.vx v24, v8, a0
 ; CHECK-NEXT:    vsll.vi v24, v24, 8
@@ -259,36 +259,36 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v10, a2
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v11, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v10, v10, a3
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v12, v8, a1
+; RV32-NEXT:    vand.vx v11, v11, a0
+; RV32-NEXT:    vlse64.v v13, (a5), zero
+; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vx v11, v8, a0
+; RV32-NEXT:    vsll.vx v11, v11, a2
+; RV32-NEXT:    vor.vv v11, v12, v11
 ; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a0
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    vsll.vx v12, v12, a1
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vand.vx v9, v9, a4
+; RV32-NEXT:    vand.vv v12, v12, v13
+; RV32-NEXT:    vor.vv v9, v12, v9
+; RV32-NEXT:    vand.vv v12, v8, v13
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v9, v9, v10
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -297,28 +297,28 @@ define <vscale x 1 x i64> @bswap_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64-LABEL: bswap_nxv1i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vx v9, v8, a0
 ; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v10, v8, a1
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v10, v10, a2
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vi v10, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV64-NEXT:    vsrl.vi v9, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v10, v10, a3
+; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    vsrl.vx v11, v8, a1
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v11, v11, a2
+; RV64-NEXT:    vor.vv v10, v11, v10
 ; RV64-NEXT:    vsrl.vi v11, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v9, v9, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vor.vv v9, v11, v9
+; RV64-NEXT:    vand.vx v11, v8, a3
+; RV64-NEXT:    vsll.vi v11, v11, 24
+; RV64-NEXT:    vor.vv v9, v9, v10
+; RV64-NEXT:    vand.vx v10, v8, a4
+; RV64-NEXT:    vsll.vi v10, v10, 8
 ; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vand.vx v10, v8, a3
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    vand.vx v11, v8, a4
-; RV64-NEXT:    vsll.vi v11, v11, 8
-; RV64-NEXT:    vor.vv v10, v10, v11
 ; RV64-NEXT:    vsll.vx v11, v8, a0
 ; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
@@ -343,36 +343,36 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v8, a0
-; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    vsrl.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v14, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v12, v12, a3
+; RV32-NEXT:    vsrl.vx v14, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v16, v8, a1
+; RV32-NEXT:    vand.vx v14, v14, a0
+; RV32-NEXT:    vlse64.v v18, (a5), zero
+; RV32-NEXT:    vor.vv v12, v14, v12
+; RV32-NEXT:    vand.vx v14, v8, a0
+; RV32-NEXT:    vsll.vx v14, v14, a2
+; RV32-NEXT:    vor.vv v14, v16, v14
 ; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v14
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a0
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    vsll.vx v16, v16, a1
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a3
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vand.vx v10, v10, a4
+; RV32-NEXT:    vand.vv v16, v16, v18
+; RV32-NEXT:    vor.vv v10, v16, v10
+; RV32-NEXT:    vand.vv v16, v8, v18
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v10, v10, v12
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v14, v8
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -381,28 +381,28 @@ define <vscale x 2 x i64> @bswap_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64-LABEL: bswap_nxv2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vx v10, v8, a0
 ; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v12, v8, a1
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v12, v12, a2
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v12, v12, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a0
+; RV64-NEXT:    vsrl.vx v14, v8, a1
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v14, v14, a2
+; RV64-NEXT:    vor.vv v12, v14, v12
 ; RV64-NEXT:    vsrl.vi v14, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v14, v14, a4
+; RV64-NEXT:    vor.vv v10, v14, v10
+; RV64-NEXT:    vand.vx v14, v8, a3
+; RV64-NEXT:    vsll.vi v14, v14, 24
+; RV64-NEXT:    vor.vv v10, v10, v12
+; RV64-NEXT:    vand.vx v12, v8, a4
+; RV64-NEXT:    vsll.vi v12, v12, 8
 ; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vand.vx v12, v8, a3
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    vand.vx v14, v8, a4
-; RV64-NEXT:    vsll.vi v14, v14, 8
-; RV64-NEXT:    vor.vv v12, v12, v14
 ; RV64-NEXT:    vsll.vx v14, v8, a0
 ; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
@@ -427,36 +427,36 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vx v12, v8, a0
-; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v20, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v16, v16, a3
+; RV32-NEXT:    vsrl.vx v20, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v24, v8, a1
+; RV32-NEXT:    vand.vx v20, v20, a0
+; RV32-NEXT:    vlse64.v v28, (a5), zero
+; RV32-NEXT:    vor.vv v16, v20, v16
+; RV32-NEXT:    vand.vx v20, v8, a0
+; RV32-NEXT:    vsll.vx v20, v20, a2
+; RV32-NEXT:    vor.vv v20, v24, v20
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v20
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vsll.vx v16, v8, a0
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    vsll.vx v24, v24, a1
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    vand.vx v24, v8, a3
-; RV32-NEXT:    vsll.vi v24, v24, 24
-; RV32-NEXT:    vand.vv v8, v8, v20
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vand.vx v12, v12, a4
+; RV32-NEXT:    vand.vv v24, v24, v28
+; RV32-NEXT:    vor.vv v12, v24, v12
+; RV32-NEXT:    vand.vv v24, v8, v28
+; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v12, v12, v16
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vor.vv v8, v20, v8
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -465,28 +465,28 @@ define <vscale x 4 x i64> @bswap_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64-LABEL: bswap_nxv4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vx v12, v8, a0
 ; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v16, v8, a1
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v16, v16, a2
-; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vi v16, v8, 24
+; RV64-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
+; RV64-NEXT:    vsrl.vi v12, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v20, v8, a1
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v20, v20, a2
+; RV64-NEXT:    vor.vv v16, v20, v16
 ; RV64-NEXT:    vsrl.vi v20, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v12, v12, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v20, v20, a4
+; RV64-NEXT:    vor.vv v12, v20, v12
+; RV64-NEXT:    vand.vx v20, v8, a3
+; RV64-NEXT:    vsll.vi v20, v20, 24
+; RV64-NEXT:    vor.vv v12, v12, v16
+; RV64-NEXT:    vand.vx v16, v8, a4
+; RV64-NEXT:    vsll.vi v16, v16, 8
 ; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vand.vx v16, v8, a3
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    vand.vx v20, v8, a4
-; RV64-NEXT:    vsll.vi v20, v20, 8
-; RV64-NEXT:    vor.vv v16, v16, v20
 ; RV64-NEXT:    vsll.vx v20, v8, a0
 ; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1
@@ -511,50 +511,57 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a0, 1044480
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v8, a0
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v24, a2
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v16, v8, a1
+; RV32-NEXT:    vsrl.vx v24, v8, a2
+; RV32-NEXT:    addi a0, a3, -256
+; RV32-NEXT:    vsll.vx v0, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a0
 ; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v0, v8, 24
-; RV32-NEXT:    addi a3, sp, 8
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    lui a3, 4080
-; RV32-NEXT:    vand.vx v0, v0, a3
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v16, v8, a0
+; RV32-NEXT:    vsll.vx v16, v16, a2
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v0, (a5), zero
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a4
 ; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vx v0, v8, a2
-; RV32-NEXT:    vsll.vx v0, v0, a1
-; RV32-NEXT:    vsll.vx v24, v8, a0
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vand.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v8, v8, a3
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v24
+; RV32-NEXT:    vand.vv v16, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a4
 ; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -564,28 +571,28 @@ define <vscale x 8 x i64> @bswap_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64-LABEL: bswap_nxv8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vx v16, v8, a0
 ; RV64-NEXT:    li a1, 40
-; RV64-NEXT:    vsrl.vx v24, v8, a1
 ; RV64-NEXT:    lui a2, 16
-; RV64-NEXT:    addiw a2, a2, -256
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 24
 ; RV64-NEXT:    lui a3, 4080
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v0, v8, a1
+; RV64-NEXT:    addiw a2, a2, -256
+; RV64-NEXT:    vand.vx v0, v0, a2
+; RV64-NEXT:    vor.vv v16, v0, v16
 ; RV64-NEXT:    vsrl.vi v0, v8, 8
 ; RV64-NEXT:    li a4, 255
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    slli a4, a4, 24
 ; RV64-NEXT:    vand.vx v0, v0, a4
 ; RV64-NEXT:    vor.vv v24, v0, v24
+; RV64-NEXT:    vand.vx v0, v8, a3
+; RV64-NEXT:    vsll.vi v0, v0, 24
 ; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsll.vi v24, v24, 24
-; RV64-NEXT:    vand.vx v0, v8, a4
-; RV64-NEXT:    vsll.vi v0, v0, 8
-; RV64-NEXT:    vor.vv v24, v24, v0
+; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vsll.vi v24, v24, 8
+; RV64-NEXT:    vor.vv v24, v0, v24
 ; RV64-NEXT:    vsll.vx v0, v8, a0
 ; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vsll.vx v8, v8, a1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 2c5b7f160d1924..1c95ec8fafd4f1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -270,9 +270,9 @@ define <vscale x 1 x i32> @vp_bswap_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -324,9 +324,9 @@ define <vscale x 2 x i32> @vp_bswap_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -378,9 +378,9 @@ define <vscale x 4 x i32> @vp_bswap_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsll.vi v12, v12, 8
@@ -432,9 +432,9 @@ define <vscale x 8 x i32> @vp_bswap_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
@@ -486,9 +486,9 @@ define <vscale x 16 x i32> @vp_bswap_nxv16i32_unmasked(<vscale x 16 x i32> %va,
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vi v24, v8, 24
 ; CHECK-NEXT:    vor.vv v16, v16, v24
 ; CHECK-NEXT:    vand.vx v24, v8, a0
 ; CHECK-NEXT:    vsll.vi v24, v24, 8
@@ -514,38 +514,38 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v11, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -554,31 +554,31 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV64-LABEL: vp_bswap_nxv1i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a5, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
-; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
@@ -599,39 +599,39 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vand.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vor.vv v11, v12, v11
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4
-; RV32-NEXT:    vsll.vi v11, v11, 24
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v10, v10, v13
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vv v13, v13, v12
+; RV32-NEXT:    vor.vv v9, v13, v9
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v9, v11
-; RV32-NEXT:    vsrl.vx v11, v8, a1
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v9, v8
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -639,34 +639,34 @@ define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV64-LABEL: vp_bswap_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsll.vi v9, v9, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0
-; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4
-; RV64-NEXT:    vor.vv v10, v10, v11
+; RV64-NEXT:    vsrl.vi v9, v8, 24
+; RV64-NEXT:    vsrl.vi v10, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v11, v8, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vx v11, v8, a4
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsrl.vi v11, v8, 24
-; RV64-NEXT:    vand.vx v11, v11, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v11
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vor.vv v8, v9, v8
+; RV64-NEXT:    vor.vv v9, v9, v11
+; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv1i64_unmasked:
@@ -686,38 +686,38 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v10, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v14, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v14, v14, 24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
+; RV32-NEXT:    vsll.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vand.vx v12, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v14, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v14, v0.t
-; RV32-NEXT:    vsrl.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -726,31 +726,31 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV64-LABEL: vp_bswap_nxv2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4, v0.t
+; RV64-NEXT:    vsll.vx v12, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v14, v14, a5, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
 ; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
-; RV64-NEXT:    vsrl.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v14, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v14, v14, a3, v0.t
+; RV64-NEXT:    vsrl.vx v12, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v14, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
 ; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
@@ -771,39 +771,39 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v14, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vand.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v14, v16, v14
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4
-; RV32-NEXT:    vsll.vi v14, v14, 24
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vsll.vx v18, v18, a4
+; RV32-NEXT:    vor.vv v12, v12, v18
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v10, v10, a5
+; RV32-NEXT:    vand.vv v18, v18, v16
+; RV32-NEXT:    vor.vv v10, v18, v10
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v14, v14, v16
-; RV32-NEXT:    vor.vv v10, v10, v14
-; RV32-NEXT:    vsrl.vx v14, v8, a1
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v10, v10, v14
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -811,34 +811,34 @@ define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV64-LABEL: vp_bswap_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4
-; RV64-NEXT:    vor.vv v12, v12, v14
+; RV64-NEXT:    vsrl.vi v10, v8, 24
+; RV64-NEXT:    vsrl.vi v12, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v14, v8, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a5
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vx v14, v8, a4
-; RV64-NEXT:    vand.vx v14, v14, a3
-; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsrl.vi v14, v8, 24
-; RV64-NEXT:    vand.vx v14, v14, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v12, v8, a2
+; RV64-NEXT:    vsll.vi v12, v12, 8
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v14
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vor.vv v10, v10, v14
+; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv2i64_unmasked:
@@ -858,34 +858,34 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a3, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
 ; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
@@ -898,31 +898,31 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV64-LABEL: vp_bswap_nxv4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v20, v20, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v20, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v20, v20, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v20, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
 ; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
@@ -943,39 +943,39 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v20, v8, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vand.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v20, v24, v20
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4
-; RV32-NEXT:    vsll.vi v20, v20, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsll.vx v28, v28, a4
+; RV32-NEXT:    vor.vv v16, v16, v28
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v12, v12, a5
+; RV32-NEXT:    vand.vv v28, v28, v24
+; RV32-NEXT:    vor.vv v12, v28, v12
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vor.vv v12, v12, v20
-; RV32-NEXT:    vsrl.vx v20, v8, a1
-; RV32-NEXT:    vsrl.vx v24, v8, a3
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v24, v24, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v12, v12, v20
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -983,34 +983,34 @@ define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV64-LABEL: vp_bswap_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v16, v20
+; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsrl.vi v16, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v20, v8, a3
+; RV64-NEXT:    vsrl.vx v24, v8, a5
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vor.vv v20, v24, v20
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vx v20, v8, a4
-; RV64-NEXT:    vand.vx v20, v20, a3
-; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vsrl.vi v20, v8, 24
-; RV64-NEXT:    vand.vx v20, v20, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsll.vx v24, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v20
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v24, v8
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vor.vv v8, v12, v8
+; RV64-NEXT:    vor.vv v12, v12, v20
+; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv4i64_unmasked:
@@ -1035,33 +1035,33 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
@@ -1080,14 +1080,14 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1123,36 +1123,35 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
@@ -1181,51 +1180,59 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v24, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    vsll.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v16, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1234,35 +1241,51 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_bswap_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    vsll.vx v0, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v0, v8
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv7i64_unmasked:
@@ -1287,33 +1310,33 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
@@ -1332,14 +1355,14 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1375,36 +1398,35 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
@@ -1433,51 +1455,59 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v24, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    vsll.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v16, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1486,35 +1516,51 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_bswap_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    vsll.vx v0, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v0, v8
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv8i64_unmasked:
@@ -1546,9 +1592,9 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -1588,9 +1634,9 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-ZVKB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVKB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVKB-NEXT:    srli a2, a1, 1
+; CHECK-ZVKB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVKB-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-ZVKB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVKB-NEXT:    slli a1, a1, 2
 ; CHECK-ZVKB-NEXT:    sub a2, a0, a1
 ; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
 ; CHECK-ZVKB-NEXT:    addi a3, a3, -1
@@ -1661,38 +1707,38 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV32-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v11, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 16, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1702,31 +1748,31 @@ define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1
 ; RV64-LABEL: vp_bswap_nxv1i48:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a5, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
-; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
index 41ff1133ff823c..a4e5ab661c5285 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll
@@ -70,24 +70,24 @@ define fastcc <vscale x 64 x i32> @ret_split_nxv64i32(ptr %x) {
 ; CHECK-LABEL: ret_split_nxv64i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vl8re32.v v8, (a1)
 ; CHECK-NEXT:    slli a3, a2, 3
 ; CHECK-NEXT:    slli a4, a2, 5
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    sub a4, a4, a3
-; CHECK-NEXT:    add a5, a1, a4
-; CHECK-NEXT:    vl8re32.v v8, (a5)
+; CHECK-NEXT:    add a5, a1, a2
+; CHECK-NEXT:    vl8re32.v v16, (a5)
 ; CHECK-NEXT:    add a5, a1, a3
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    vl8re32.v v16, (a1)
-; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    vl8re32.v v24, (a1)
-; CHECK-NEXT:    vl8re32.v v0, (a5)
-; CHECK-NEXT:    vs8r.v v16, (a0)
 ; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vs8r.v v24, (a2)
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    vs8r.v v0, (a3)
-; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    add a1, a1, a4
+; CHECK-NEXT:    vl8re32.v v24, (a5)
+; CHECK-NEXT:    vl8re32.v v0, (a1)
 ; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a2)
+; CHECK-NEXT:    vs8r.v v24, (a3)
+; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    vs8r.v v0, (a0)
 ; CHECK-NEXT:    ret
   %v = load <vscale x 64 x i32>, ptr %x
   ret <vscale x 64 x i32> %v
@@ -100,87 +100,99 @@ define fastcc <vscale x 128 x i32> @ret_split_nxv128i32(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
+; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a3, a2, 3
-; CHECK-NEXT:    slli a4, a2, 5
-; CHECK-NEXT:    sub a5, a4, a3
-; CHECK-NEXT:    add a6, a1, a5
-; CHECK-NEXT:    vl8re32.v v8, (a6)
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    li a7, 24
-; CHECK-NEXT:    mul a6, a6, a7
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    vl8re32.v v8, (a1)
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    slli a4, a4, 5
+; CHECK-NEXT:    add a4, sp, a4
+; CHECK-NEXT:    addi a4, a4, 16
+; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    slli a4, a2, 3
+; CHECK-NEXT:    slli a5, a2, 5
 ; CHECK-NEXT:    slli a6, a2, 4
 ; CHECK-NEXT:    slli a7, a2, 6
+; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    sub a3, a5, a4
 ; CHECK-NEXT:    sub t0, a7, a6
-; CHECK-NEXT:    add t1, a1, t0
+; CHECK-NEXT:    sub a7, a7, a4
+; CHECK-NEXT:    add t1, a1, a4
+; CHECK-NEXT:    add t2, a1, a6
+; CHECK-NEXT:    add t3, a1, a5
 ; CHECK-NEXT:    vl8re32.v v8, (t1)
 ; CHECK-NEXT:    csrr t1, vlenb
-; CHECK-NEXT:    slli t1, t1, 4
+; CHECK-NEXT:    li t4, 24
+; CHECK-NEXT:    mul t1, t1, t4
 ; CHECK-NEXT:    add t1, sp, t1
 ; CHECK-NEXT:    addi t1, t1, 16
 ; CHECK-NEXT:    vs8r.v v8, (t1) # Unknown-size Folded Spill
-; CHECK-NEXT:    sub a7, a7, a3
-; CHECK-NEXT:    add t1, a1, a7
+; CHECK-NEXT:    add t1, a1, a2
+; CHECK-NEXT:    vl8re32.v v8, (t2)
+; CHECK-NEXT:    csrr t2, vlenb
+; CHECK-NEXT:    slli t2, t2, 3
+; CHECK-NEXT:    add t2, sp, t2
+; CHECK-NEXT:    addi t2, t2, 16
+; CHECK-NEXT:    vs8r.v v8, (t2) # Unknown-size Folded Spill
+; CHECK-NEXT:    add t2, a1, a3
+; CHECK-NEXT:    vl8re32.v v16, (t3)
+; CHECK-NEXT:    add t3, a1, t0
+; CHECK-NEXT:    add a1, a1, a7
 ; CHECK-NEXT:    vl8re32.v v8, (t1)
+; CHECK-NEXT:    vl8re32.v v24, (t2)
 ; CHECK-NEXT:    csrr t1, vlenb
-; CHECK-NEXT:    slli t1, t1, 3
+; CHECK-NEXT:    slli t1, t1, 4
 ; CHECK-NEXT:    add t1, sp, t1
 ; CHECK-NEXT:    addi t1, t1, 16
-; CHECK-NEXT:    vs8r.v v8, (t1) # Unknown-size Folded Spill
-; CHECK-NEXT:    add t1, a1, a3
-; CHECK-NEXT:    vl8re32.v v8, (t1)
+; CHECK-NEXT:    vs8r.v v24, (t1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v24, (t3)
 ; CHECK-NEXT:    addi t1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (t1) # Unknown-size Folded Spill
-; CHECK-NEXT:    add t1, a1, a6
-; CHECK-NEXT:    add t2, a1, a4
-; CHECK-NEXT:    li t3, 40
-; CHECK-NEXT:    mul a2, a2, t3
-; CHECK-NEXT:    add t3, a1, a2
-; CHECK-NEXT:    vl8re32.v v8, (a1)
-; CHECK-NEXT:    vl8re32.v v0, (t1)
-; CHECK-NEXT:    vl8re32.v v16, (t3)
-; CHECK-NEXT:    vl8re32.v v24, (t2)
-; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v24, (t1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v24, (a1)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v0, (a0)
 ; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vs8r.v v16, (a2)
-; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    vs8r.v v24, (a4)
+; CHECK-NEXT:    vs8r.v v8, (a2)
+; CHECK-NEXT:    add a5, a0, a5
+; CHECK-NEXT:    vs8r.v v16, (a5)
 ; CHECK-NEXT:    add a6, a0, a6
-; CHECK-NEXT:    vs8r.v v0, (a6)
-; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vs8r.v v8, (a3)
-; CHECK-NEXT:    add a7, a0, a7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vs8r.v v8, (a7)
-; CHECK-NEXT:    add t0, a0, t0
+; CHECK-NEXT:    vs8r.v v8, (a6)
+; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v8, (a4)
+; CHECK-NEXT:    add a7, a0, a7
+; CHECK-NEXT:    vs8r.v v24, (a7)
+; CHECK-NEXT:    add t0, a0, t0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vs8r.v v8, (t0)
-; CHECK-NEXT:    add a0, a0, a5
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    li a1, 40
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -236,40 +248,52 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    vl8re32.v v0, (a1)
-; CHECK-NEXT:    vl8re32.v v16, (a3)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vl8re32.v v8, (a2)
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v0, (a0)
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a2, a2, a1
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl8re32.v v8, (a0)
 ; CHECK-NEXT:    vl8re32.v v16, (a2)
-; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vadd.vv v24, v8, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vadd.vv v0, v8, v0
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vadd.vv v0, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vadd.vv v24, v0, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vadd.vv v8, v0, v8
-; CHECK-NEXT:    vadd.vv v24, v24, v16
+; CHECK-NEXT:    vadd.vv v8, v8, v16
 ; CHECK-NEXT:    vadd.vx v16, v8, a4
 ; CHECK-NEXT:    vadd.vx v8, v24, a4
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -306,10 +330,10 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vsca
 ; RV32-NEXT:    add a3, a0, a1
 ; RV32-NEXT:    vl8re32.v v24, (a3)
 ; RV32-NEXT:    vl8re32.v v0, (a0)
+; RV32-NEXT:    addi a3, sp, 128
 ; RV32-NEXT:    addi a0, sp, 128
-; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    vs8r.v v8, (a3)
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    li a3, 2
 ; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    vmv8r.v v8, v0
@@ -344,10 +368,10 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vsca
 ; RV64-NEXT:    add a3, a0, a1
 ; RV64-NEXT:    vl8re32.v v24, (a3)
 ; RV64-NEXT:    vl8re32.v v0, (a0)
+; RV64-NEXT:    addi a3, sp, 128
 ; RV64-NEXT:    addi a0, sp, 128
-; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    addi a0, sp, 128
+; RV64-NEXT:    vs8r.v v8, (a3)
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    li a3, 2
 ; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    vmv8r.v v8, v0
@@ -382,34 +406,35 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    andi sp, sp, -128
+; RV32-NEXT:    addi a1, sp, 128
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a3, a2, a1
-; RV32-NEXT:    vl8re32.v v24, (a3)
+; RV32-NEXT:    vl8re32.v v16, (a2)
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 128
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a2, a2, a1
 ; RV32-NEXT:    add a3, a0, a1
-; RV32-NEXT:    vl8re32.v v24, (a3)
-; RV32-NEXT:    addi a3, sp, 128
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vl8re32.v v0, (a2)
-; RV32-NEXT:    vl8re32.v v24, (a0)
+; RV32-NEXT:    vl8re32.v v24, (a3)
+; RV32-NEXT:    vl8re32.v v16, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 128
-; RV32-NEXT:    vs8r.v v24, (a2)
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 128
+; RV32-NEXT:    vs8r.v v16, (a3)
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vs8r.v v16, (a0)
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    addi a2, sp, 128
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
@@ -418,16 +443,15 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 128
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    li a5, 42
-; RV32-NEXT:    addi a3, sp, 128
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vs8r.v v8, (a1)
-; RV32-NEXT:    vmv8r.v v8, v0
+; RV32-NEXT:    vs8r.v v24, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 128
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv8r.v v16, v0
 ; RV32-NEXT:    call ext3
 ; RV32-NEXT:    addi sp, s0, -144
 ; RV32-NEXT:    .cfi_def_cfa sp, 144
@@ -454,34 +478,35 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    andi sp, sp, -128
+; RV64-NEXT:    addi a1, sp, 128
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a3, a2, a1
-; RV64-NEXT:    vl8re32.v v24, (a3)
+; RV64-NEXT:    vl8re32.v v16, (a2)
 ; RV64-NEXT:    csrr a3, vlenb
 ; RV64-NEXT:    slli a3, a3, 3
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 128
-; RV64-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a2, a2, a1
 ; RV64-NEXT:    add a3, a0, a1
-; RV64-NEXT:    vl8re32.v v24, (a3)
-; RV64-NEXT:    addi a3, sp, 128
-; RV64-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vl8re32.v v0, (a2)
-; RV64-NEXT:    vl8re32.v v24, (a0)
+; RV64-NEXT:    vl8re32.v v24, (a3)
+; RV64-NEXT:    vl8re32.v v16, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 5
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 128
-; RV64-NEXT:    vs8r.v v24, (a2)
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 5
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 128
+; RV64-NEXT:    vs8r.v v16, (a3)
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vs8r.v v16, (a0)
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    addi a2, sp, 128
+; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 5
 ; RV64-NEXT:    add a0, sp, a0
@@ -490,16 +515,15 @@ define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_
 ; RV64-NEXT:    slli a2, a2, 4
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 128
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    li a5, 42
-; RV64-NEXT:    addi a3, sp, 128
-; RV64-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV64-NEXT:    vs8r.v v8, (a1)
-; RV64-NEXT:    vmv8r.v v8, v0
+; RV64-NEXT:    vs8r.v v24, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 128
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv8r.v v16, v0
 ; RV64-NEXT:    call ext3
 ; RV64-NEXT:    addi sp, s0, -144
 ; RV64-NEXT:    .cfi_def_cfa sp, 144
@@ -557,29 +581,29 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    addi a1, s1, 128
-; RV32-NEXT:    vs8r.v v8, (a1)
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add a2, s1, a2
-; RV32-NEXT:    addi a2, a2, 128
-; RV32-NEXT:    vs8r.v v8, (a2)
-; RV32-NEXT:    li a3, 8
-; RV32-NEXT:    sw a3, 0(sp)
-; RV32-NEXT:    add a1, a1, a0
-; RV32-NEXT:    vs8r.v v8, (a1)
-; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    addi t0, s1, 128
+; RV32-NEXT:    csrr t1, vlenb
+; RV32-NEXT:    slli t1, t1, 4
+; RV32-NEXT:    add t1, s1, t1
+; RV32-NEXT:    addi t1, t1, 128
+; RV32-NEXT:    li a7, 8
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    li a2, 2
 ; RV32-NEXT:    li a3, 3
 ; RV32-NEXT:    li a4, 4
 ; RV32-NEXT:    li a5, 5
 ; RV32-NEXT:    li a6, 6
+; RV32-NEXT:    vs8r.v v8, (t0)
+; RV32-NEXT:    vs8r.v v8, (t1)
+; RV32-NEXT:    sw a7, 0(sp)
 ; RV32-NEXT:    li a7, 7
+; RV32-NEXT:    add t0, t0, a0
+; RV32-NEXT:    add a0, t1, a0
 ; RV32-NEXT:    csrr t3, vlenb
 ; RV32-NEXT:    slli t3, t3, 4
 ; RV32-NEXT:    add t3, s1, t3
 ; RV32-NEXT:    addi t3, t3, 128
+; RV32-NEXT:    vs8r.v v8, (t0)
 ; RV32-NEXT:    addi t5, s1, 128
 ; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    li a0, 0
@@ -620,29 +644,29 @@ define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    addi a1, s1, 128
-; RV64-NEXT:    vs8r.v v8, (a1)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 4
-; RV64-NEXT:    add a2, s1, a2
-; RV64-NEXT:    addi a2, a2, 128
-; RV64-NEXT:    vs8r.v v8, (a2)
-; RV64-NEXT:    li a3, 8
-; RV64-NEXT:    sd a3, 0(sp)
-; RV64-NEXT:    add a1, a1, a0
-; RV64-NEXT:    vs8r.v v8, (a1)
-; RV64-NEXT:    add a0, a2, a0
+; RV64-NEXT:    addi t0, s1, 128
+; RV64-NEXT:    csrr t1, vlenb
+; RV64-NEXT:    slli t1, t1, 4
+; RV64-NEXT:    add t1, s1, t1
+; RV64-NEXT:    addi t1, t1, 128
+; RV64-NEXT:    li a7, 8
 ; RV64-NEXT:    li a1, 1
 ; RV64-NEXT:    li a2, 2
 ; RV64-NEXT:    li a3, 3
 ; RV64-NEXT:    li a4, 4
 ; RV64-NEXT:    li a5, 5
 ; RV64-NEXT:    li a6, 6
+; RV64-NEXT:    vs8r.v v8, (t0)
+; RV64-NEXT:    vs8r.v v8, (t1)
+; RV64-NEXT:    sd a7, 0(sp)
 ; RV64-NEXT:    li a7, 7
+; RV64-NEXT:    add t0, t0, a0
+; RV64-NEXT:    add a0, t1, a0
 ; RV64-NEXT:    csrr t3, vlenb
 ; RV64-NEXT:    slli t3, t3, 4
 ; RV64-NEXT:    add t3, s1, t3
 ; RV64-NEXT:    addi t3, t3, 128
+; RV64-NEXT:    vs8r.v v8, (t0)
 ; RV64-NEXT:    addi t5, s1, 128
 ; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    li a0, 0
@@ -708,7 +732,6 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    addi a0, s1, 64
-; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    li a2, 2
 ; RV32-NEXT:    li a3, 3
@@ -720,6 +743,7 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV32-NEXT:    li t4, 9
 ; RV32-NEXT:    li t5, 10
 ; RV32-NEXT:    li t6, 11
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    sw a0, 0(sp)
 ; RV32-NEXT:    li a0, 0
 ; RV32-NEXT:    vmv.v.i v16, 0
@@ -758,7 +782,6 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    addi a0, s1, 64
-; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    li a1, 1
 ; RV64-NEXT:    li a2, 2
 ; RV64-NEXT:    li a3, 3
@@ -770,6 +793,7 @@ define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale
 ; RV64-NEXT:    li t4, 9
 ; RV64-NEXT:    li t5, 10
 ; RV64-NEXT:    li t6, 11
+; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    sd a0, 0(sp)
 ; RV64-NEXT:    li a0, 0
 ; RV64-NEXT:    vmv.v.i v16, 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
index 277d8c9d55eaf2..9b27116fef7cae 100644
--- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll
@@ -36,12 +36,12 @@ define <vscale x 32 x i32> @caller_scalable_vector_split_indirect(<vscale x 32 x
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    addi a0, sp, 128
-; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    addi a0, sp, 128
 ; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    vmv.v.i v16, 0
@@ -71,12 +71,12 @@ define <vscale x 32 x i32> @caller_scalable_vector_split_indirect(<vscale x 32 x
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    addi a0, sp, 128
-; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
-; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    vmv.v.i v16, 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 2c05f27460bcdb..7d0b0118a72725 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vs
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -44,10 +44,10 @@ define <vscale x 1 x bfloat> @vp_ceil_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -69,10 +69,10 @@ define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16(<vscale x 2 x bfloat> %va, <vs
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -94,10 +94,10 @@ define <vscale x 2 x bfloat> @vp_ceil_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -120,10 +120,10 @@ define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16(<vscale x 4 x bfloat> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -146,10 +146,10 @@ define <vscale x 4 x bfloat> @vp_ceil_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -172,10 +172,10 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16(<vscale x 8 x bfloat> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -198,10 +198,10 @@ define <vscale x 8 x bfloat> @vp_ceil_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -224,10 +224,10 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16(<vscale x 16 x bfloat> %va,
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -250,10 +250,10 @@ define <vscale x 16 x bfloat> @vp_ceil_vv_nxv16bf16_unmasked(<vscale x 16 x bflo
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -279,62 +279,54 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v17, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 3
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vmv1r.v v8, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -354,51 +346,41 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 3
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
@@ -410,12 +392,6 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16_unmasked(<vscale x 32 x bflo
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.ceil.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -444,10 +420,10 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16(<vscale x 1 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -484,10 +460,10 @@ define <vscale x 1 x half> @vp_ceil_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -526,10 +502,10 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16(<vscale x 2 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -566,10 +542,10 @@ define <vscale x 2 x half> @vp_ceil_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -609,10 +585,10 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -650,10 +626,10 @@ define <vscale x 4 x half> @vp_ceil_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -673,9 +649,9 @@ declare <vscale x 8 x half> @llvm.vp.ceil.nxv8f16(<vscale x 8 x half>, <vscale x
 define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv8f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -695,10 +671,10 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -736,10 +712,10 @@ define <vscale x 8 x half> @vp_ceil_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -759,9 +735,9 @@ declare <vscale x 16 x half> @llvm.vp.ceil.nxv16f16(<vscale x 16 x half>, <vscal
 define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -781,10 +757,10 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -822,10 +798,10 @@ define <vscale x 16 x half> @vp_ceil_vv_nxv16f16_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -845,9 +821,9 @@ declare <vscale x 32 x half> @llvm.vp.ceil.nxv32f16(<vscale x 32 x half>, <vscal
 define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_vv_nxv32f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
@@ -870,62 +846,54 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v8, v16
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -960,51 +928,41 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ;
 ; ZVFHMIN-LABEL: vp_ceil_vv_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 3
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -1016,12 +974,6 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.ceil.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1290,9 +1242,9 @@ declare <vscale x 2 x double> @llvm.vp.ceil.nxv2f64(<vscale x 2 x double>, <vsca
 define <vscale x 2 x double> @vp_ceil_vv_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv2f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -1334,9 +1286,9 @@ declare <vscale x 4 x double> @llvm.vp.ceil.nxv4f64(<vscale x 4 x double>, <vsca
 define <vscale x 4 x double> @vp_ceil_vv_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -1378,9 +1330,9 @@ declare <vscale x 7 x double> @llvm.vp.ceil.nxv7f64(<vscale x 7 x double>, <vsca
 define <vscale x 7 x double> @vp_ceil_vv_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv7f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1422,9 +1374,9 @@ declare <vscale x 8 x double> @llvm.vp.ceil.nxv8f64(<vscale x 8 x double>, <vsca
 define <vscale x 8 x double> @vp_ceil_vv_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1475,12 +1427,12 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI44_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a3)
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
@@ -1501,23 +1453,26 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -1533,12 +1488,12 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64_unmasked(<vscale x 16 x doubl
 ; CHECK-LABEL: vp_ceil_vv_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI45_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a3)
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI45_0)
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
index ee4be0cf865a2d..400dfd393509c0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
@@ -197,88 +197,82 @@ entry:
 define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data) {
 ; RV64-LABEL: test_compresstore_v256i8:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 4
-; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv1r.v v7, v8
 ; RV64-NEXT:    li a2, 128
-; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vle8.v v16, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v0, 1
-; RV64-NEXT:    vmv.x.s a1, v9
 ; RV64-NEXT:    vmv.x.s a3, v0
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 16
-; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vcompress.vm v16, v24, v0
+; RV64-NEXT:    vle8.v v24, (a1)
+; RV64-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT:    vcompress.vm v8, v16, v0
 ; RV64-NEXT:    vcpop.m a4, v0
 ; RV64-NEXT:    vsetvli zero, a4, e8, m8, ta, ma
-; RV64-NEXT:    vse8.v v16, (a0)
-; RV64-NEXT:    addi a4, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vcompress.vm v16, v24, v8
-; RV64-NEXT:    vcpop.m a2, v8
+; RV64-NEXT:    vcpop.m a2, v7
 ; RV64-NEXT:    cpop a3, a3
 ; RV64-NEXT:    cpop a1, a1
 ; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    vcompress.vm v8, v24, v7
 ; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vse8.v v16, (a0)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_compresstore_v256i8:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    vmv1r.v v7, v8
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    sub sp, sp, a2
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    vmv8r.v v24, v16
 ; RV32-NEXT:    li a2, 128
-; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vle8.v v24, (a1)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v0, 1
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v10, v9, a1
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    vsrl.vx v10, v0, a1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vmv.x.s a4, v9
-; RV32-NEXT:    vmv.x.s a5, v0
+; RV32-NEXT:    li a3, 32
+; RV32-NEXT:    vmv.x.s a4, v0
+; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT:    vle8.v v16, (a1)
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vx v10, v9, a3
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vsrl.vx v9, v0, a3
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v8, v16, v0
-; RV32-NEXT:    vcpop.m a6, v0
-; RV32-NEXT:    vsetvli zero, a6, e8, m8, ta, ma
-; RV32-NEXT:    vse8.v v8, (a0)
+; RV32-NEXT:    vcompress.vm v16, v24, v0
+; RV32-NEXT:    vcpop.m a3, v0
+; RV32-NEXT:    cpop a4, a4
+; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vmv.x.s a6, v9
+; RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV32-NEXT:    vse8.v v16, (a0)
 ; RV32-NEXT:    cpop a1, a1
+; RV32-NEXT:    cpop a3, a6
 ; RV32-NEXT:    cpop a5, a5
-; RV32-NEXT:    add a1, a5, a1
-; RV32-NEXT:    cpop a3, a3
-; RV32-NEXT:    cpop a4, a4
 ; RV32-NEXT:    add a3, a4, a3
-; RV32-NEXT:    add a1, a1, a3
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a1, a1, a5
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v8, v24, v7
-; RV32-NEXT:    vcpop.m a1, v7
+; RV32-NEXT:    vcompress.vm v16, v24, v8
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    vcpop.m a1, v8
 ; RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; RV32-NEXT:    vse8.v v8, (a0)
+; RV32-NEXT:    vse8.v v16, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 entry:
   tail call void @llvm.masked.compressstore.v256i8(<256 x i8> %data, ptr align 1 %p, <256 x i1> %mask)
@@ -463,43 +457,45 @@ define void @test_compresstore_v128i16(ptr %p, <128 x i1> %mask, <128 x i16> %da
 ; RV64-NEXT:    vse16.v v24, (a0)
 ; RV64-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v0, 8
+; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64-NEXT:    vmv.x.s a2, v0
 ; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; RV64-NEXT:    vcompress.vm v24, v16, v8
-; RV64-NEXT:    vcpop.m a2, v8
-; RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
-; RV64-NEXT:    cpop a1, a1
-; RV64-NEXT:    slli a1, a1, 1
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; RV64-NEXT:    vcpop.m a1, v8
+; RV64-NEXT:    cpop a2, a2
+; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; RV64-NEXT:    vse16.v v24, (a0)
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_compresstore_v128i16:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a1, 64
-; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v24, v8, v0
-; RV32-NEXT:    vcpop.m a2, v0
-; RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; RV32-NEXT:    vse16.v v24, (a0)
 ; RV32-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 8
+; RV32-NEXT:    vslidedown.vi v7, v0, 8
+; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; RV32-NEXT:    vcompress.vm v24, v16, v7
+; RV32-NEXT:    vcpop.m a2, v7
+; RV32-NEXT:    li a3, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a4, v0
 ; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v8, v16, v24
-; RV32-NEXT:    vcpop.m a1, v24
-; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    vcompress.vm v16, v8, v0
+; RV32-NEXT:    vcpop.m a1, v0
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v16, v0, a2
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    cpop a2, a2
-; RV32-NEXT:    vmv.x.s a3, v0
-; RV32-NEXT:    cpop a3, a3
-; RV32-NEXT:    add a2, a3, a2
-; RV32-NEXT:    slli a2, a2, 1
-; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vsrl.vx v8, v0, a3
 ; RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; RV32-NEXT:    vse16.v v8, (a0)
+; RV32-NEXT:    vse16.v v16, (a0)
+; RV32-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
+; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    cpop a1, a1
+; RV32-NEXT:    cpop a3, a4
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; RV32-NEXT:    vse16.v v24, (a0)
 ; RV32-NEXT:    ret
 entry:
   tail call void @llvm.masked.compressstore.v128i16(<128 x i16> %data, ptr align 2 %p, <128 x i1> %mask)
@@ -659,10 +655,11 @@ define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data)
 ; RV64-NEXT:    vse32.v v24, (a0)
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v0, 4
+; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV64-NEXT:    vmv.x.s a2, v0
 ; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV64-NEXT:    vcompress.vm v24, v16, v8
 ; RV64-NEXT:    vcpop.m a1, v8
-; RV64-NEXT:    vmv.x.s a2, v0
 ; RV64-NEXT:    cpopw a2, a2
 ; RV64-NEXT:    slli a2, a2, 2
 ; RV64-NEXT:    add a0, a0, a2
@@ -680,10 +677,11 @@ define void @test_compresstore_v64i32(ptr %p, <64 x i1> %mask, <64 x i32> %data)
 ; RV32-NEXT:    vse32.v v24, (a0)
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v0, 4
+; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.x.s a2, v0
 ; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-NEXT:    vcompress.vm v24, v16, v8
 ; RV32-NEXT:    vcpop.m a1, v8
-; RV32-NEXT:    vmv.x.s a2, v0
 ; RV32-NEXT:    cpop a2, a2
 ; RV32-NEXT:    slli a2, a2, 2
 ; RV32-NEXT:    add a0, a0, a2
@@ -822,10 +820,10 @@ define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data)
 ; RV64-NEXT:    vse64.v v24, (a0)
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v0, 2
+; RV64-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV64-NEXT:    vmv.x.s a1, v0
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vcompress.vm v24, v16, v8
-; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vmv.x.s a1, v0
 ; RV64-NEXT:    zext.h a1, a1
 ; RV64-NEXT:    cpopw a1, a1
 ; RV64-NEXT:    slli a1, a1, 3
@@ -844,10 +842,10 @@ define void @test_compresstore_v32i64(ptr %p, <32 x i1> %mask, <32 x i64> %data)
 ; RV32-NEXT:    vse64.v v24, (a0)
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v0, 2
+; RV32-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32-NEXT:    vmv.x.s a1, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vcompress.vm v24, v16, v8
-; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vmv.x.s a1, v0
 ; RV32-NEXT:    zext.h a1, a1
 ; RV32-NEXT:    cpop a1, a1
 ; RV32-NEXT:    slli a1, a1, 3

diff  --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
index 7839b602706db1..ad176df71397e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll
@@ -18,50 +18,52 @@
 define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lanes.b, <4 x i1> %sel) {
 ; RV32-LABEL: constant_folding_crash:
 ; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    vmv1r.v v10, v0
 ; RV32-NEXT:    lw a0, 8(a0)
 ; RV32-NEXT:    andi a0, a0, 1
 ; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vmsne.vi v10, v10, 0
-; RV32-NEXT:    vmv1r.v v11, v0
-; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vmv.v.x v11, a0
+; RV32-NEXT:    vmsne.vi v0, v11, 0
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    vmv1r.v v0, v11
-; RV32-NEXT:    vmerge.vim v8, v8, 1, v0
-; RV32-NEXT:    vrgather.vi v9, v8, 0
-; RV32-NEXT:    vmsne.vi v0, v9, 0
+; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v11, 10
+; RV32-NEXT:    vmv1r.v v0, v10
+; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV32-NEXT:    vrgather.vi v10, v9, 0
+; RV32-NEXT:    vmsne.vi v0, v10, 0
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v8, 10
-; RV32-NEXT:    vse32.v v8, (a0), v0.t
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vse32.v v11, (a0), v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: constant_folding_crash:
 ; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    ld a0, 8(a0)
 ; RV64-NEXT:    andi a0, a0, 1
 ; RV64-NEXT:    seqz a0, a0
 ; RV64-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; RV64-NEXT:    vmv.v.x v12, a0
-; RV64-NEXT:    vmsne.vi v12, v12, 0
-; RV64-NEXT:    vmv1r.v v13, v0
-; RV64-NEXT:    vmv1r.v v0, v12
+; RV64-NEXT:    vmv.v.x v13, a0
+; RV64-NEXT:    vmsne.vi v0, v13, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RV64-NEXT:    vmerge.vvm v8, v10, v8, v0
-; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    vmv1r.v v0, v13
-; RV64-NEXT:    vmerge.vim v8, v8, 1, v0
-; RV64-NEXT:    vrgather.vi v9, v8, 0
-; RV64-NEXT:    vmsne.vi v0, v9, 0
+; RV64-NEXT:    vmv.v.i v9, 0
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v8, 10
-; RV64-NEXT:    vse32.v v8, (a0), v0.t
+; RV64-NEXT:    vmv.v.i v10, 10
+; RV64-NEXT:    vmv1r.v v0, v12
+; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV64-NEXT:    vrgather.vi v11, v9, 0
+; RV64-NEXT:    vmsne.vi v0, v11, 0
+; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vse32.v v10, (a0), v0.t
 ; RV64-NEXT:    ret
 entry:
   %sunkaddr = getelementptr i8, ptr %v54, i64 8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
index d51f5eacd7d91a..208735b18cbab5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll
@@ -13,6 +13,7 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -20,10 +21,9 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -37,11 +37,11 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -51,11 +51,11 @@ define <vscale x 1 x i8> @ctlz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v9, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -76,6 +76,7 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -83,10 +84,9 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -100,11 +100,11 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -114,11 +114,11 @@ define <vscale x 2 x i8> @ctlz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v9, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -139,6 +139,7 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -146,10 +147,9 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -163,11 +163,11 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v9, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
 ; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -177,11 +177,11 @@ define <vscale x 4 x i8> @ctlz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v9, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
 ; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -202,6 +202,7 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -209,10 +210,9 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -226,11 +226,11 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
 ; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v10, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v10, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -240,11 +240,11 @@ define <vscale x 8 x i8> @ctlz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
 ; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v10, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v10, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -265,6 +265,7 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
@@ -272,10 +273,9 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -289,11 +289,11 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v12, v8
+; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
 ; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-F-NEXT:    li a0, 134
 ; CHECK-F-NEXT:    vrsub.vx v8, v12, a0
 ; CHECK-F-NEXT:    li a0, 8
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -303,11 +303,11 @@ define <vscale x 16 x i8> @ctlz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v12, v8
+; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v12
 ; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-D-NEXT:    li a0, 134
 ; CHECK-D-NEXT:    vrsub.vx v8, v12, a0
 ; CHECK-D-NEXT:    li a0, 8
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -328,6 +328,7 @@ define <vscale x 32 x i8> @ctlz_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -335,10 +336,9 @@ define <vscale x 32 x i8> @ctlz_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -363,6 +363,7 @@ define <vscale x 64 x i8> @ctlz_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -370,10 +371,9 @@ define <vscale x 64 x i8> @ctlz_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -398,7 +398,9 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -407,20 +409,18 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -431,8 +431,8 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -442,8 +442,8 @@ define <vscale x 1 x i16> @ctlz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -464,7 +464,9 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -473,20 +475,18 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -497,8 +497,8 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -508,8 +508,8 @@ define <vscale x 2 x i16> @ctlz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -530,7 +530,9 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -539,20 +541,18 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -563,8 +563,8 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -574,8 +574,8 @@ define <vscale x 4 x i16> @ctlz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -596,7 +596,9 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
@@ -605,20 +607,18 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -629,8 +629,8 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -640,8 +640,8 @@ define <vscale x 8 x i16> @ctlz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -662,7 +662,9 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
@@ -671,20 +673,18 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -695,8 +695,8 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-F-NEXT:    li a0, 142
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a0
@@ -706,8 +706,8 @@ define <vscale x 16 x i16> @ctlz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-D-NEXT:    li a0, 142
+; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a0
@@ -728,7 +728,9 @@ define <vscale x 32 x i16> @ctlz_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
@@ -737,20 +739,18 @@ define <vscale x 32 x i16> @ctlz_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -772,7 +772,9 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -783,20 +785,18 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -809,8 +809,8 @@ define <vscale x 1 x i32> @ctlz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -844,7 +844,9 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -855,20 +857,18 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -881,8 +881,8 @@ define <vscale x 2 x i32> @ctlz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -916,7 +916,9 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
@@ -927,20 +929,18 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -953,8 +953,8 @@ define <vscale x 4 x i32> @ctlz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -988,7 +988,9 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
@@ -999,20 +1001,18 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -1025,8 +1025,8 @@ define <vscale x 8 x i32> @ctlz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -1060,7 +1060,9 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
@@ -1071,20 +1073,18 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v16, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -1097,8 +1097,8 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F-NEXT:    fsrmi a0, 1
 ; CHECK-F-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 158
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
 ; CHECK-F-NEXT:    vminu.vx v8, v8, a1
@@ -1110,8 +1110,8 @@ define <vscale x 16 x i32> @ctlz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D-NEXT:    fsrmi a0, 1
 ; CHECK-D-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-D-NEXT:    li a1, 158
+; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-D-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 32
 ; CHECK-D-NEXT:    vminu.vx v8, v8, a1
@@ -1133,6 +1133,12 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vsrl.vi v9, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v9
@@ -1142,40 +1148,34 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vsrl.vi v9, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v9, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1185,6 +1185,23 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v9
@@ -1194,37 +1211,20 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v9
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v9, v8, a0
+; RV64I-NEXT:    vsrl.vx v9, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v9, v9, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vx v9, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1232,16 +1232,16 @@ define <vscale x 1 x i64> @ctlz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv1i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-F-NEXT:    vmv.v.x v9, a0
-; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v10, 23
-; CHECK-F-NEXT:    vwsubu.vv v10, v9, v8
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    fsrmi a1, 1
+; CHECK-F-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v9, v8
+; CHECK-F-NEXT:    vmv.v.x v8, a0
+; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
+; CHECK-F-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v10, a1
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vminu.vx v8, v10, a0
+; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv1i64:
@@ -1273,6 +1273,12 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vsrl.vi v10, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v10
@@ -1282,40 +1288,34 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vsrl.vi v10, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v10, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1325,6 +1325,23 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v10
@@ -1334,37 +1351,20 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v10
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v10, v8, a0
+; RV64I-NEXT:    vsrl.vx v10, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v10, v10, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vx v10, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1372,16 +1372,16 @@ define <vscale x 2 x i64> @ctlz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv2i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-F-NEXT:    vmv.v.x v10, a0
-; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfncvt.f.xu.w v11, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v11, 23
-; CHECK-F-NEXT:    vwsubu.vv v12, v10, v8
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    fsrmi a1, 1
+; CHECK-F-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
+; CHECK-F-NEXT:    vmv.v.x v8, a0
+; CHECK-F-NEXT:    vsrl.vi v9, v10, 23
+; CHECK-F-NEXT:    vwsubu.vv v10, v8, v9
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v12, a1
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vminu.vx v8, v10, a0
+; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv2i64:
@@ -1413,6 +1413,12 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vsrl.vi v12, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v12
@@ -1422,40 +1428,34 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vsrl.vi v12, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v12
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v12, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1465,6 +1465,23 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v12
@@ -1474,37 +1491,20 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v12
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v12, v8, a0
+; RV64I-NEXT:    vsrl.vx v12, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v12, v12, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vx v12, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1512,16 +1512,16 @@ define <vscale x 4 x i64> @ctlz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv4i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-F-NEXT:    vmv.v.x v12, a0
-; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfncvt.f.xu.w v14, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v14, 23
-; CHECK-F-NEXT:    vwsubu.vv v16, v12, v8
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    fsrmi a1, 1
+; CHECK-F-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8
+; CHECK-F-NEXT:    vmv.v.x v8, a0
+; CHECK-F-NEXT:    vsrl.vi v10, v12, 23
+; CHECK-F-NEXT:    vwsubu.vv v12, v8, v10
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v16, a1
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vminu.vx v8, v12, a0
+; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv4i64:
@@ -1553,6 +1553,12 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32I-NEXT:    vmv.v.x v24, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vsrl.vi v16, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v16
@@ -1562,41 +1568,35 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vsrl.vi v16, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v16
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v16, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vand.vv v24, v16, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v24, v8, v16
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v24, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
 ; RV32I-NEXT:    ret
@@ -1605,6 +1605,23 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v16
@@ -1614,37 +1631,20 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v16
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v16, v8, a0
+; RV64I-NEXT:    vsrl.vx v16, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v16, v16, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vx v16, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1652,16 +1652,16 @@ define <vscale x 8 x i64> @ctlz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F-LABEL: ctlz_nxv8i64:
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    li a0, 190
-; CHECK-F-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-F-NEXT:    vmv.v.x v16, a0
-; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfncvt.f.xu.w v20, v8
-; CHECK-F-NEXT:    vsrl.vi v8, v20, 23
-; CHECK-F-NEXT:    vwsubu.vv v24, v16, v8
-; CHECK-F-NEXT:    li a1, 64
+; CHECK-F-NEXT:    fsrmi a1, 1
+; CHECK-F-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8
+; CHECK-F-NEXT:    vmv.v.x v8, a0
+; CHECK-F-NEXT:    vsrl.vi v12, v16, 23
+; CHECK-F-NEXT:    vwsubu.vv v16, v8, v12
+; CHECK-F-NEXT:    li a0, 64
 ; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-F-NEXT:    vminu.vx v8, v24, a1
-; CHECK-F-NEXT:    fsrm a0
+; CHECK-F-NEXT:    vminu.vx v8, v16, a0
+; CHECK-F-NEXT:    fsrm a1
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: ctlz_nxv8i64:
@@ -1693,6 +1693,7 @@ define <vscale x 1 x i8> @ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -1700,10 +1701,9 @@ define <vscale x 1 x i8> @ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1751,6 +1751,7 @@ define <vscale x 2 x i8> @ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -1758,10 +1759,9 @@ define <vscale x 2 x i8> @ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1809,6 +1809,7 @@ define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -1816,10 +1817,9 @@ define <vscale x 4 x i8> @ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1867,6 +1867,7 @@ define <vscale x 8 x i8> @ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
@@ -1874,10 +1875,9 @@ define <vscale x 8 x i8> @ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1925,6 +1925,7 @@ define <vscale x 16 x i8> @ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
@@ -1932,10 +1933,9 @@ define <vscale x 16 x i8> @ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    li a0, 85
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1983,6 +1983,7 @@ define <vscale x 32 x i8> @ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -1990,10 +1991,9 @@ define <vscale x 32 x i8> @ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2017,6 +2017,7 @@ define <vscale x 64 x i8> @ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -2024,10 +2025,9 @@ define <vscale x 64 x i8> @ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2051,7 +2051,9 @@ define <vscale x 1 x i16> @ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2060,20 +2062,18 @@ define <vscale x 1 x i16> @ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2112,7 +2112,9 @@ define <vscale x 2 x i16> @ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2121,20 +2123,18 @@ define <vscale x 2 x i16> @ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2173,7 +2173,9 @@ define <vscale x 4 x i16> @ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2182,20 +2184,18 @@ define <vscale x 4 x i16> @ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2234,7 +2234,9 @@ define <vscale x 8 x i16> @ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
@@ -2243,20 +2245,18 @@ define <vscale x 8 x i16> @ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2295,7 +2295,9 @@ define <vscale x 16 x i16> @ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
@@ -2304,20 +2306,18 @@ define <vscale x 16 x i16> @ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 5
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2356,7 +2356,9 @@ define <vscale x 32 x i16> @ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
@@ -2365,20 +2367,18 @@ define <vscale x 32 x i16> @ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2399,7 +2399,9 @@ define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2410,20 +2412,18 @@ define <vscale x 1 x i32> @ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2466,7 +2466,9 @@ define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
@@ -2477,20 +2479,18 @@ define <vscale x 2 x i32> @ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2533,7 +2533,9 @@ define <vscale x 4 x i32> @ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
@@ -2544,20 +2546,18 @@ define <vscale x 4 x i32> @ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2600,7 +2600,9 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
@@ -2611,20 +2613,18 @@ define <vscale x 8 x i32> @ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2667,7 +2667,9 @@ define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
+; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
@@ -2678,20 +2680,18 @@ define <vscale x 16 x i32> @ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X-NEXT:    vor.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-ZVE64X-NEXT:    lui a0, 349525
-; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v16, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2735,6 +2735,12 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vsrl.vi v9, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v9
@@ -2744,40 +2750,34 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vsrl.vi v9, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v9, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v9
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2787,6 +2787,23 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v9
@@ -2796,37 +2813,20 @@ define <vscale x 1 x i64> @ctlz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v9
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v9, v8, a0
+; RV64I-NEXT:    vsrl.vx v9, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v9, v9, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vx v9, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2869,6 +2869,12 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vsrl.vi v10, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v10
@@ -2878,40 +2884,34 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vsrl.vi v10, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v10, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v10
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2921,6 +2921,23 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v10
@@ -2930,37 +2947,20 @@ define <vscale x 2 x i64> @ctlz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v10
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v10, v8, a0
+; RV64I-NEXT:    vsrl.vx v10, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v10, v10, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vx v10, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3003,6 +3003,12 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vsrl.vi v12, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v12
@@ -3012,40 +3018,34 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vsrl.vi v12, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v12
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v12, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v12
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v12, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -3055,6 +3055,23 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v12
@@ -3064,37 +3081,20 @@ define <vscale x 4 x i64> @ctlz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v12
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v12, v8, a0
+; RV64I-NEXT:    vsrl.vx v12, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v12
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v12, v12, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vx v12, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3137,6 +3137,12 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    addi a0, a0, 1365
+; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32I-NEXT:    vmv.v.x v24, a0
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vsrl.vi v16, v8, 2
 ; RV32I-NEXT:    vor.vv v8, v8, v16
@@ -3146,41 +3152,35 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vsrl.vi v16, v8, 16
 ; RV32I-NEXT:    vor.vv v8, v8, v16
-; RV32I-NEXT:    li a0, 32
 ; RV32I-NEXT:    vsrl.vx v16, v8, a0
+; RV32I-NEXT:    lui a0, 209715
+; RV32I-NEXT:    addi a0, a0, 819
 ; RV32I-NEXT:    vor.vv v8, v8, v16
 ; RV32I-NEXT:    vnot.v v8, v8
 ; RV32I-NEXT:    vsrl.vi v16, v8, 1
-; RV32I-NEXT:    lui a0, 349525
-; RV32I-NEXT:    addi a0, a0, 1365
-; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 209715
-; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vand.vv v24, v16, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v24, v8, v16
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v24, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
 ; RV32I-NEXT:    ret
@@ -3189,6 +3189,23 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
+; RV64I-NEXT:    lui a0, 349525
+; RV64I-NEXT:    lui a1, 209715
+; RV64I-NEXT:    lui a2, 61681
+; RV64I-NEXT:    lui a3, 4112
+; RV64I-NEXT:    addiw a0, a0, 1365
+; RV64I-NEXT:    addiw a1, a1, 819
+; RV64I-NEXT:    addiw a2, a2, -241
+; RV64I-NEXT:    addiw a3, a3, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
+; RV64I-NEXT:    li a4, 32
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 2
 ; RV64I-NEXT:    vor.vv v8, v8, v16
@@ -3198,37 +3215,20 @@ define <vscale x 8 x i64> @ctlz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v16
-; RV64I-NEXT:    li a0, 32
-; RV64I-NEXT:    vsrl.vx v16, v8, a0
+; RV64I-NEXT:    vsrl.vx v16, v8, a4
 ; RV64I-NEXT:    vor.vv v8, v8, v16
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v16, v16, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vx v16, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 2c9f633b890143..f56a792fdef6a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -15,6 +15,7 @@ define <vscale x 1 x i8> @vp_ctlz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
@@ -22,7 +23,6 @@ define <vscale x 1 x i8> @vp_ctlz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -42,11 +42,11 @@ define <vscale x 1 x i8> @vp_ctlz_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -68,6 +68,7 @@ define <vscale x 2 x i8> @vp_ctlz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
@@ -75,7 +76,6 @@ define <vscale x 2 x i8> @vp_ctlz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -95,11 +95,11 @@ define <vscale x 2 x i8> @vp_ctlz_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -121,6 +121,7 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
@@ -128,7 +129,6 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -148,11 +148,11 @@ define <vscale x 4 x i8> @vp_ctlz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -174,6 +174,7 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
@@ -181,7 +182,6 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -201,11 +201,11 @@ define <vscale x 8 x i8> @vp_ctlz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -227,6 +227,7 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
@@ -234,7 +235,6 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -254,11 +254,11 @@ define <vscale x 16 x i8> @vp_ctlz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0
 ; CHECK-NEXT:    li a0, 8
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -280,6 +280,7 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -287,10 +288,9 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -314,6 +314,7 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -321,10 +322,9 @@ define <vscale x 32 x i8> @vp_ctlz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -350,6 +350,7 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -357,10 +358,9 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -384,6 +384,7 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -391,10 +392,9 @@ define <vscale x 64 x i8> @vp_ctlz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -420,11 +420,11 @@ define <vscale x 1 x i16> @vp_ctlz_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -444,8 +444,8 @@ define <vscale x 1 x i16> @vp_ctlz_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -467,11 +467,11 @@ define <vscale x 2 x i16> @vp_ctlz_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -491,8 +491,8 @@ define <vscale x 2 x i16> @vp_ctlz_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v9, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -514,11 +514,11 @@ define <vscale x 4 x i16> @vp_ctlz_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -538,8 +538,8 @@ define <vscale x 4 x i16> @vp_ctlz_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v10, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -561,11 +561,11 @@ define <vscale x 8 x i16> @vp_ctlz_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -585,8 +585,8 @@ define <vscale x 8 x i16> @vp_ctlz_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v12, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -608,11 +608,11 @@ define <vscale x 16 x i16> @vp_ctlz_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -632,8 +632,8 @@ define <vscale x 16 x i16> @vp_ctlz_nxv16i16_unmasked(<vscale x 16 x i16> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vnsrl.wi v8, v16, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -655,7 +655,9 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
@@ -664,20 +666,18 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -698,7 +698,9 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
@@ -707,20 +709,18 @@ define <vscale x 32 x i16> @vp_ctlz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -746,9 +746,9 @@ define <vscale x 1 x i32> @vp_ctlz_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -795,9 +795,9 @@ define <vscale x 2 x i32> @vp_ctlz_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -844,9 +844,9 @@ define <vscale x 4 x i32> @vp_ctlz_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -893,9 +893,9 @@ define <vscale x 8 x i32> @vp_ctlz_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -940,8 +940,8 @@ define <vscale x 16 x i32> @vp_ctlz_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -963,8 +963,8 @@ define <vscale x 16 x i32> @vp_ctlz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
-; CHECK-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vminu.vx v8, v8, a0
@@ -1237,20 +1237,20 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
+; CHECK-NEXT:    fsrmi a4, 1
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    sub a5, a0, a1
+; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    sltu a3, a0, a5
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    and a5, a3, a5
+; CHECK-NEXT:    li a3, 1086
+; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
-; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    fsrm a4
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    li a3, 1086
 ; CHECK-NEXT:    vrsub.vx v16, v16, a3, v0.t
 ; CHECK-NEXT:    li a4, 64
 ; CHECK-NEXT:    vminu.vx v16, v16, a4, v0.t
@@ -1273,12 +1273,12 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB46_2
@@ -1297,17 +1297,17 @@ define <vscale x 16 x i64> @vp_ctlz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; CHECK-LABEL: vp_ctlz_nxv16i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-NEXT:    fsrm a3
+; CHECK-NEXT:    fsrmi a4, 1
 ; CHECK-NEXT:    li a2, 52
-; CHECK-NEXT:    vsrl.vx v16, v16, a2
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    sltu a5, a0, a3
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a5, a5, a3
 ; CHECK-NEXT:    li a3, 1086
+; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
+; CHECK-NEXT:    fsrm a4
+; CHECK-NEXT:    vsrl.vx v16, v16, a2
 ; CHECK-NEXT:    vrsub.vx v16, v16, a3
 ; CHECK-NEXT:    li a4, 64
 ; CHECK-NEXT:    vminu.vx v16, v16, a4
@@ -1349,6 +1349,7 @@ define <vscale x 1 x i8> @vp_ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
@@ -1356,7 +1357,6 @@ define <vscale x 1 x i8> @vp_ctlz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1397,6 +1397,7 @@ define <vscale x 2 x i8> @vp_ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
@@ -1404,7 +1405,6 @@ define <vscale x 2 x i8> @vp_ctlz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1445,6 +1445,7 @@ define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
@@ -1452,7 +1453,6 @@ define <vscale x 4 x i8> @vp_ctlz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1493,6 +1493,7 @@ define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
@@ -1500,7 +1501,6 @@ define <vscale x 8 x i8> @vp_ctlz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1541,6 +1541,7 @@ define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
+; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
@@ -1548,7 +1549,6 @@ define <vscale x 16 x i8> @vp_ctlz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    li a0, 134
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1589,6 +1589,7 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -1596,10 +1597,9 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <v
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -1623,6 +1623,7 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
@@ -1630,10 +1631,9 @@ define <vscale x 32 x i8> @vp_ctlz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -1658,6 +1658,7 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1665,10 +1666,9 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <v
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -1692,6 +1692,7 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
@@ -1699,10 +1700,9 @@ define <vscale x 64 x i8> @vp_ctlz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -1727,11 +1727,11 @@ define <vscale x 1 x i16> @vp_ctlz_zero_undef_nxv1i16(<vscale x 1 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1769,11 +1769,11 @@ define <vscale x 2 x i16> @vp_ctlz_zero_undef_nxv2i16(<vscale x 2 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1811,11 +1811,11 @@ define <vscale x 4 x i16> @vp_ctlz_zero_undef_nxv4i16(<vscale x 4 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1853,11 +1853,11 @@ define <vscale x 8 x i16> @vp_ctlz_zero_undef_nxv8i16(<vscale x 8 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1895,11 +1895,11 @@ define <vscale x 16 x i16> @vp_ctlz_zero_undef_nxv16i16(<vscale x 16 x i16> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -1937,7 +1937,9 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
@@ -1946,20 +1948,18 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16(<vscale x 32 x i16> %va,
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -1980,7 +1980,9 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
@@ -1989,20 +1991,18 @@ define <vscale x 32 x i16> @vp_ctlz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    vor.vv v8, v8, v16
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2027,9 +2027,9 @@ define <vscale x 1 x i32> @vp_ctlz_zero_undef_nxv1i32(<vscale x 1 x i32> %va, <v
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2071,9 +2071,9 @@ define <vscale x 2 x i32> @vp_ctlz_zero_undef_nxv2i32(<vscale x 2 x i32> %va, <v
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2115,9 +2115,9 @@ define <vscale x 4 x i32> @vp_ctlz_zero_undef_nxv4i32(<vscale x 4 x i32> %va, <v
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2159,9 +2159,9 @@ define <vscale x 8 x i32> @vp_ctlz_zero_undef_nxv8i32(<vscale x 8 x i32> %va, <v
 ; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1054
 ; CHECK-NEXT:    vrsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2201,8 +2201,8 @@ define <vscale x 16 x i32> @vp_ctlz_zero_undef_nxv16i32(<vscale x 16 x i32> %va,
 ; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    li a0, 158
+; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    ret
@@ -2467,18 +2467,18 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a4, a0, a1
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    sltu a2, a0, a4
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a4, a2, a4
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
 ; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
 ; CHECK-NEXT:    li a3, 1086
 ; CHECK-NEXT:    vrsub.vx v16, v16, a3, v0.t
@@ -2500,12 +2500,12 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vclz.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB94_2
@@ -2524,15 +2524,15 @@ define <vscale x 16 x i64> @vp_ctlz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-LABEL: vp_ctlz_zero_undef_nxv16i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    fsrmi a3, 1
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    sltu a4, a0, a2
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a4, a4, a2
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2
 ; CHECK-NEXT:    li a3, 1086
 ; CHECK-NEXT:    vrsub.vx v16, v16, a3
@@ -2576,12 +2576,12 @@ define <vscale x 1 x i9> @vp_ctlz_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 ; CHECK-NEXT:    li a1, 511
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:    vminu.vx v8, v8, a0, v0.t
@@ -2606,12 +2606,12 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2629,13 +2629,13 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2686,13 +2686,13 @@ define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2743,14 +2743,14 @@ define <vscale x 1 x i9> @vp_ctlo_zero_nxv1i9_unpredicated_ctlz_with_vp_xor(<vsc
 ; CHECK-NEXT:    li a1, 511
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vxor.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    li a0, 142
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
 ; CHECK-NEXT:    vnsrl.wi v8, v9, 23
-; CHECK-NEXT:    li a0, 142
 ; CHECK-NEXT:    vrsub.vx v8, v8, a0
-; CHECK-NEXT:    li a0, 16
-; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 7
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
index c310274d685081..fa8e332e5076de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll
@@ -11,8 +11,8 @@ define <vscale x 1 x i8> @ctpop_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -39,8 +39,8 @@ define <vscale x 2 x i8> @ctpop_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -67,8 +67,8 @@ define <vscale x 4 x i8> @ctpop_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -95,8 +95,8 @@ define <vscale x 8 x i8> @ctpop_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -123,8 +123,8 @@ define <vscale x 16 x i8> @ctpop_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -151,8 +151,8 @@ define <vscale x 32 x i8> @ctpop_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -179,8 +179,8 @@ define <vscale x 64 x i8> @ctpop_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -208,17 +208,17 @@ define <vscale x 1 x i16> @ctpop_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -243,17 +243,17 @@ define <vscale x 2 x i16> @ctpop_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -278,17 +278,17 @@ define <vscale x 4 x i16> @ctpop_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -313,17 +313,17 @@ define <vscale x 8 x i16> @ctpop_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -348,17 +348,17 @@ define <vscale x 16 x i16> @ctpop_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -383,17 +383,17 @@ define <vscale x 32 x i16> @ctpop_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -418,17 +418,17 @@ define <vscale x 1 x i32> @ctpop_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -454,17 +454,17 @@ define <vscale x 2 x i32> @ctpop_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -490,17 +490,17 @@ define <vscale x 4 x i32> @ctpop_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -526,17 +526,17 @@ define <vscale x 8 x i32> @ctpop_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -562,17 +562,17 @@ define <vscale x 16 x i32> @ctpop_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -681,31 +681,31 @@ define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a0, 209715
 ; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    lui a0, 4112
 ; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -713,34 +713,34 @@ define <vscale x 1 x i64> @ctpop_nxv1i64(<vscale x 1 x i64> %va) {
 ;
 ; RV64-LABEL: ctpop_nxv1i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT:    vsrl.vi v9, v8, 1
 ; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
 ; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; RV64-NEXT:    vsrl.vi v9, v8, 1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -764,31 +764,31 @@ define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a0, 209715
 ; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    lui a0, 4112
 ; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -796,34 +796,34 @@ define <vscale x 2 x i64> @ctpop_nxv2i64(<vscale x 2 x i64> %va) {
 ;
 ; RV64-LABEL: ctpop_nxv2i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV64-NEXT:    vsrl.vi v10, v8, 1
 ; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
 ; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; RV64-NEXT:    vsrl.vi v10, v8, 1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -847,31 +847,31 @@ define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a0, 209715
 ; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vmv.v.x v16, a0
 ; RV32-NEXT:    lui a0, 61681
 ; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmv.v.x v16, a0
 ; RV32-NEXT:    lui a0, 4112
 ; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -879,34 +879,34 @@ define <vscale x 4 x i64> @ctpop_nxv4i64(<vscale x 4 x i64> %va) {
 ;
 ; RV64-LABEL: ctpop_nxv4i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV64-NEXT:    vsrl.vi v12, v8, 1
 ; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
 ; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m4, ta, ma
+; RV64-NEXT:    vsrl.vi v12, v8, 1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -930,66 +930,66 @@ define <vscale x 8 x i64> @ctpop_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32-NEXT:    addi a0, a0, 1365
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a0, 209715
 ; RV32-NEXT:    addi a0, a0, 819
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    lui a0, 61681
+; RV32-NEXT:    addi a0, a0, -241
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a0, 61681
-; RV32-NEXT:    addi a0, a0, -241
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a0
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a0, 4112
 ; RV32-NEXT:    addi a0, a0, 257
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a0
+; RV32-NEXT:    vmv.v.x v24, a0
 ; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ctpop_nxv8i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1
 ; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
 ; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v16, v8, 1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 093eb0ead313ef..9e75dc9dccffde 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -17,8 +17,8 @@ define <vscale x 1 x i8> @vp_ctpop_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -44,8 +44,8 @@ define <vscale x 1 x i8> @vp_ctpop_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 ze
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -73,8 +73,8 @@ define <vscale x 2 x i8> @vp_ctpop_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -100,8 +100,8 @@ define <vscale x 2 x i8> @vp_ctpop_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 ze
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -129,8 +129,8 @@ define <vscale x 4 x i8> @vp_ctpop_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -156,8 +156,8 @@ define <vscale x 4 x i8> @vp_ctpop_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 ze
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -185,8 +185,8 @@ define <vscale x 8 x i8> @vp_ctpop_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -212,8 +212,8 @@ define <vscale x 8 x i8> @vp_ctpop_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 ze
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -241,8 +241,8 @@ define <vscale x 16 x i8> @vp_ctpop_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -268,8 +268,8 @@ define <vscale x 16 x i8> @vp_ctpop_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -297,8 +297,8 @@ define <vscale x 32 x i8> @vp_ctpop_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -324,8 +324,8 @@ define <vscale x 32 x i8> @vp_ctpop_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -353,8 +353,8 @@ define <vscale x 64 x i8> @vp_ctpop_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -380,8 +380,8 @@ define <vscale x 64 x i8> @vp_ctpop_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -410,17 +410,17 @@ define <vscale x 1 x i16> @vp_ctpop_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -444,17 +444,17 @@ define <vscale x 1 x i16> @vp_ctpop_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -480,17 +480,17 @@ define <vscale x 2 x i16> @vp_ctpop_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -514,17 +514,17 @@ define <vscale x 2 x i16> @vp_ctpop_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -550,17 +550,17 @@ define <vscale x 4 x i16> @vp_ctpop_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -584,17 +584,17 @@ define <vscale x 4 x i16> @vp_ctpop_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -620,17 +620,17 @@ define <vscale x 8 x i16> @vp_ctpop_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -654,17 +654,17 @@ define <vscale x 8 x i16> @vp_ctpop_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -690,17 +690,17 @@ define <vscale x 16 x i16> @vp_ctpop_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -724,17 +724,17 @@ define <vscale x 16 x i16> @vp_ctpop_nxv16i16_unmasked(<vscale x 16 x i16> %va,
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -760,17 +760,17 @@ define <vscale x 32 x i16> @vp_ctpop_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -794,17 +794,17 @@ define <vscale x 32 x i16> @vp_ctpop_nxv32i16_unmasked(<vscale x 32 x i16> %va,
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -830,17 +830,17 @@ define <vscale x 1 x i32> @vp_ctpop_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -865,17 +865,17 @@ define <vscale x 1 x i32> @vp_ctpop_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -902,17 +902,17 @@ define <vscale x 2 x i32> @vp_ctpop_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -937,17 +937,17 @@ define <vscale x 2 x i32> @vp_ctpop_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -974,17 +974,17 @@ define <vscale x 4 x i32> @vp_ctpop_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1009,17 +1009,17 @@ define <vscale x 4 x i32> @vp_ctpop_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1046,17 +1046,17 @@ define <vscale x 8 x i32> @vp_ctpop_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1081,17 +1081,17 @@ define <vscale x 8 x i32> @vp_ctpop_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1118,17 +1118,17 @@ define <vscale x 16 x i32> @vp_ctpop_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1153,17 +1153,17 @@ define <vscale x 16 x i32> @vp_ctpop_nxv16i32_unmasked(<vscale x 16 x i32> %va,
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1191,67 +1191,67 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv1i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1273,31 +1273,31 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1305,34 +1305,34 @@ define <vscale x 1 x i64> @vp_ctpop_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1357,67 +1357,67 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv2i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1439,31 +1439,31 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1471,34 +1471,34 @@ define <vscale x 2 x i64> @vp_ctpop_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1523,67 +1523,67 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v16, v12, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv4i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1605,31 +1605,31 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1637,34 +1637,34 @@ define <vscale x 4 x i64> @vp_ctpop_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ;
 ; RV64-LABEL: vp_ctpop_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1689,67 +1689,67 @@ define <vscale x 7 x i64> @vp_ctpop_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv7i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1771,66 +1771,66 @@ define <vscale x 7 x i64> @vp_ctpop_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1855,67 +1855,67 @@ define <vscale x 8 x i64> @vp_ctpop_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv8i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1937,66 +1937,66 @@ define <vscale x 8 x i64> @vp_ctpop_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -2018,54 +2018,53 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    vmv1r.v v7, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a2, a1, 3
-; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    srli a3, a1, 3
+; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a3
+; RV32-NEXT:    sub a3, a0, a1
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    sltu a2, a0, a3
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    lui a3, 209715
@@ -2073,13 +2072,13 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
@@ -2088,7 +2087,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
@@ -2102,8 +2101,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -2114,7 +2112,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a3
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -2122,75 +2120,61 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB46_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB46_2:
 ; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv8r.v v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -2198,26 +2182,22 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
@@ -2250,39 +2230,39 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a4, a4, -241
 ; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
+; RV64-NEXT:    slli a6, a2, 32
+; RV64-NEXT:    add a6, a2, a6
+; RV64-NEXT:    slli a2, a3, 32
+; RV64-NEXT:    add a7, a3, a2
+; RV64-NEXT:    slli a2, a4, 32
+; RV64-NEXT:    add a2, a4, a2
+; RV64-NEXT:    slli a3, a5, 32
+; RV64-NEXT:    add a3, a5, a3
+; RV64-NEXT:    li a4, 56
 ; RV64-NEXT:    sub a1, a0, a1
 ; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a7, v0.t
+; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a7, v0.t
+; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a4, v0.t
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a1, a1, 3
@@ -2291,17 +2271,17 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v16, a7, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2317,12 +2297,12 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vcpop.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB46_2
@@ -2347,27 +2327,26 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    sub a4, a0, a1
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a2
+; RV32-NEXT:    sltu a2, a0, a4
+; RV32-NEXT:    addi a2, a2, -1
+; RV32-NEXT:    and a2, a2, a4
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v24, v16, 1
-; RV32-NEXT:    lui a3, 349525
-; RV32-NEXT:    addi a3, a3, 1365
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vs8r.v v0, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vand.vv v24, v24, v0
 ; RV32-NEXT:    vsub.vv v16, v16, v24
-; RV32-NEXT:    lui a3, 209715
-; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2383,8 +2362,10 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT:    vsrl.vi v24, v16, 4
 ; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    lui a4, 4112
 ; RV32-NEXT:    addi a3, a3, -241
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    addi a4, a4, 257
+; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
@@ -2393,10 +2374,8 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    lui a3, 4112
-; RV32-NEXT:    addi a3, a3, 257
-; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a3
+; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a4
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -2448,59 +2427,69 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64_unmasked(<vscale x 16 x i64> %va,
 ;
 ; RV64-LABEL: vp_ctpop_nxv16i64_unmasked:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    mv a2, a0
-; RV64-NEXT:    bltu a0, a1, .LBB47_2
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    bltu a0, a2, .LBB47_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:  .LBB47_2:
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    sub a1, a0, a1
-; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a7, a3, 32
+; RV64-NEXT:    add a3, a3, a7
+; RV64-NEXT:    slli a7, a4, 32
+; RV64-NEXT:    add a4, a4, a7
+; RV64-NEXT:    slli a7, a5, 32
+; RV64-NEXT:    add a5, a5, a7
+; RV64-NEXT:    slli a7, a6, 32
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    li a7, 56
+; RV64-NEXT:    sub a2, a0, a2
+; RV64-NEXT:    sltu a0, a0, a2
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a4
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vand.vx v16, v16, a4
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a6
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a7
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v16, v16, a6
+; RV64-NEXT:    vsrl.vx v16, v16, a7
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64_unmasked:
@@ -2532,21 +2521,21 @@ define <vscale x 1 x i9> @vp_ctpop_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i
 ; CHECK-NEXT:    li a1, 511
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
index 3bddcf798f66bf..5761ae0926eae4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll
@@ -13,14 +13,14 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -34,6 +34,7 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v9
@@ -41,7 +42,6 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -51,6 +51,7 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v9
@@ -58,7 +59,6 @@ define <vscale x 1 x i8> @cttz_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -79,14 +79,14 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -100,6 +100,7 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v9
@@ -107,7 +108,6 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -117,6 +117,7 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v9
@@ -124,7 +125,6 @@ define <vscale x 2 x i8> @cttz_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -145,14 +145,14 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -166,6 +166,7 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v9
@@ -173,7 +174,6 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v9, v12, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -183,6 +183,7 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v9
@@ -190,7 +191,6 @@ define <vscale x 4 x i8> @cttz_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v9, v12, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v9, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -211,14 +211,14 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -232,6 +232,7 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v10, v9
@@ -239,7 +240,6 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v9, v10, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -249,6 +249,7 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v10, v9
@@ -256,7 +257,6 @@ define <vscale x 8 x i8> @cttz_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v9, v10, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -277,14 +277,14 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -298,6 +298,7 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-F-NEXT:    vzext.vf2 v12, v10
@@ -305,7 +306,6 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-F-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-F-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-F-NEXT:    vnsrl.wi v10, v12, 0
-; CHECK-F-NEXT:    li a0, 127
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-F-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-F-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -315,6 +315,7 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vand.vv v10, v8, v10
 ; CHECK-D-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-D-NEXT:    vzext.vf2 v12, v10
@@ -322,7 +323,6 @@ define <vscale x 16 x i8> @cttz_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-D-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-D-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-D-NEXT:    vnsrl.wi v10, v12, 0
-; CHECK-D-NEXT:    li a0, 127
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-D-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-D-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -343,14 +343,14 @@ define <vscale x 32 x i8> @cttz_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -375,14 +375,14 @@ define <vscale x 64 x i8> @cttz_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -407,24 +407,24 @@ define <vscale x 1 x i16> @cttz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -435,28 +435,28 @@ define <vscale x 1 x i16> @cttz_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv1i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v9, v8, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
+; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-D-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv1i16:
@@ -474,24 +474,24 @@ define <vscale x 2 x i16> @cttz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -502,28 +502,28 @@ define <vscale x 2 x i16> @cttz_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a0
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv2i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v9, v8, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
+; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v8, v9
+; CHECK-D-NEXT:    vnsrl.wi v8, v8, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv2i16:
@@ -541,24 +541,24 @@ define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -569,28 +569,28 @@ define <vscale x 4 x i16> @cttz_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-F-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv4i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v9, v8, v9
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    vnsrl.wi v9, v10, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v9
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v8
+; CHECK-D-NEXT:    vnsrl.wi v8, v10, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv4i16:
@@ -608,24 +608,24 @@ define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -636,28 +636,28 @@ define <vscale x 8 x i16> @cttz_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v10, v8, v10
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-F-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v10, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv8i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-D-NEXT:    vand.vv v10, v8, v10
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-D-NEXT:    vnsrl.wi v10, v12, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v10
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v8
+; CHECK-D-NEXT:    vnsrl.wi v8, v12, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v10, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv8i16:
@@ -675,24 +675,24 @@ define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v12, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v12, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -703,28 +703,28 @@ define <vscale x 16 x i16> @cttz_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v12, v8, v12
-; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-F-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-F-NEXT:    li a0, 127
-; CHECK-F-NEXT:    vsub.vx v12, v12, a0
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vand.vv v8, v8, v12
+; CHECK-F-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-F-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-F-NEXT:    li a0, 16
-; CHECK-F-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-F-NEXT:    ret
 ;
 ; CHECK-D-LABEL: cttz_nxv16i16:
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-D-NEXT:    vand.vv v12, v8, v12
-; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-D-NEXT:    vnsrl.wi v12, v16, 23
 ; CHECK-D-NEXT:    li a0, 127
-; CHECK-D-NEXT:    vsub.vx v12, v12, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vand.vv v8, v8, v12
+; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v8
+; CHECK-D-NEXT:    vnsrl.wi v8, v16, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-D-NEXT:    li a0, 16
-; CHECK-D-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv16i16:
@@ -742,24 +742,24 @@ define <vscale x 32 x i16> @cttz_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -781,24 +781,24 @@ define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -810,15 +810,15 @@ define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v9
-; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a1
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -826,15 +826,15 @@ define <vscale x 1 x i32> @cttz_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v9, v10, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    li a0, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv1i32:
@@ -852,24 +852,24 @@ define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -881,15 +881,15 @@ define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v9, v9
-; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v9, v9, a1
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v9
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -897,15 +897,15 @@ define <vscale x 2 x i32> @cttz_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v9
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v9, v10, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v9, v9, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v9, a0
 ; CHECK-D-NEXT:    li a0, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv2i32:
@@ -923,24 +923,24 @@ define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -952,15 +952,15 @@ define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v10, v10
-; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v10, v10, a1
+; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v10
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -968,15 +968,15 @@ define <vscale x 4 x i32> @cttz_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v10, v8, v10
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v10
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v10, v12, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v10, v10, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v10, a0
 ; CHECK-D-NEXT:    li a0, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v10, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv4i32:
@@ -994,24 +994,24 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v12, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v12, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -1023,15 +1023,15 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v12, v8, v12
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v12, v12
-; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v12, v12, a1
+; CHECK-F-NEXT:    vand.vv v12, v8, v12
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v12
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1039,15 +1039,15 @@ define <vscale x 8 x i32> @cttz_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v12, v8, v12
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v12
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v12, v16, a0
 ; CHECK-D-NEXT:    li a0, 1023
-; CHECK-D-NEXT:    vsub.vx v12, v12, a0
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v12, a0
 ; CHECK-D-NEXT:    li a0, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-D-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: cttz_nxv8i32:
@@ -1065,24 +1065,24 @@ define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v16, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v16, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v16, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v16, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -1094,15 +1094,15 @@ define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-F-NEXT:    vand.vv v16, v8, v16
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vsub.vx v16, v16, a1
+; CHECK-F-NEXT:    vand.vv v16, v8, v16
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v16
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-F-NEXT:    li a1, 32
-; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1110,15 +1110,15 @@ define <vscale x 16 x i32> @cttz_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-D-NEXT:    vand.vv v16, v8, v16
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-D-NEXT:    vsrl.vi v16, v16, 23
 ; CHECK-D-NEXT:    li a1, 127
-; CHECK-D-NEXT:    vsub.vx v16, v16, a1
+; CHECK-D-NEXT:    vand.vv v16, v8, v16
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v16
+; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-D-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 32
-; CHECK-D-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1137,39 +1137,39 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vsub.vx v9, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vnot.v v9, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vand.vv v9, v9, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1178,37 +1178,37 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-LABEL: cttz_nxv1i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
 ; RV64I-NEXT:    vsub.vx v9, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v9, v9, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vx v9, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1217,17 +1217,17 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v9
-; CHECK-F-NEXT:    vsrl.vi v9, v10, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v10, v9, a1
-; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT:    vand.vv v9, v8, v9
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v9
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v9, v8, a1
 ; CHECK-F-NEXT:    li a1, 64
-; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-F-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1235,16 +1235,16 @@ define <vscale x 1 x i64> @cttz_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v9, v8, v9
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v9, v9
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v9, v8, v9
+; CHECK-D-NEXT:    vfcvt.f.xu.v v9, v9
 ; CHECK-D-NEXT:    vsrl.vx v9, v9, a1
 ; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v9, v9, a1
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v9, a1
 ; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v9, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1263,39 +1263,39 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vsub.vx v10, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vnot.v v10, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v10, v10, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1304,37 +1304,37 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-LABEL: cttz_nxv2i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
 ; RV64I-NEXT:    vsub.vx v10, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v10, v10, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vx v10, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1343,17 +1343,17 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v10
-; CHECK-F-NEXT:    vsrl.vi v10, v12, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v12, v10, a1
-; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT:    vand.vv v10, v8, v10
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v10
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v10, v8, a1
 ; CHECK-F-NEXT:    li a1, 64
-; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-F-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1361,16 +1361,16 @@ define <vscale x 2 x i64> @cttz_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-D-NEXT:    vand.vv v10, v8, v10
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v10, v10
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v10, v8, v10
+; CHECK-D-NEXT:    vfcvt.f.xu.v v10, v10
 ; CHECK-D-NEXT:    vsrl.vx v10, v10, a1
 ; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v10, v10, a1
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v10, a1
 ; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v10, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1389,39 +1389,39 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vsub.vx v12, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vnot.v v12, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v12, v12, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -1430,37 +1430,37 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-LABEL: cttz_nxv4i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
 ; RV64I-NEXT:    vsub.vx v12, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v12, v12, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vx v12, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1469,17 +1469,17 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v12, v8, v12
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v12
-; CHECK-F-NEXT:    vsrl.vi v12, v16, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v16, v12, a1
-; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT:    vand.vv v12, v8, v12
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v12
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v12, v8, a1
 ; CHECK-F-NEXT:    li a1, 64
-; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-F-NEXT:    vmerge.vxm v8, v12, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1487,16 +1487,16 @@ define <vscale x 4 x i64> @cttz_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-D-NEXT:    vand.vv v12, v8, v12
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v12, v12
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v12, v8, v12
+; CHECK-D-NEXT:    vfcvt.f.xu.v v12, v12
 ; CHECK-D-NEXT:    vsrl.vx v12, v12, a1
 ; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v12, v12, a1
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v12, a1
 ; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v12, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1515,40 +1515,40 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vsub.vx v16, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vnot.v v16, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vand.vv v8, v16, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vand.vv v24, v16, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v24, v8, v16
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v24, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
 ; RV32I-NEXT:    ret
@@ -1556,37 +1556,37 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-LABEL: cttz_nxv8i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v16, v16, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vx v16, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -1595,17 +1595,17 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-F-NEXT:    vand.vv v16, v8, v16
 ; CHECK-F-NEXT:    fsrmi a0, 1
-; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-F-NEXT:    vfncvt.f.xu.w v24, v16
-; CHECK-F-NEXT:    vsrl.vi v16, v24, 23
 ; CHECK-F-NEXT:    li a1, 127
-; CHECK-F-NEXT:    vwsubu.vx v24, v16, a1
-; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT:    vand.vv v16, v8, v16
 ; CHECK-F-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-F-NEXT:    vfncvt.f.xu.w v8, v16
+; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
+; CHECK-F-NEXT:    vwsubu.vx v16, v8, a1
 ; CHECK-F-NEXT:    li a1, 64
-; CHECK-F-NEXT:    vmerge.vxm v8, v24, a1, v0
+; CHECK-F-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-F-NEXT:    vmerge.vxm v8, v16, a1, v0
 ; CHECK-F-NEXT:    fsrm a0
 ; CHECK-F-NEXT:    ret
 ;
@@ -1613,16 +1613,16 @@ define <vscale x 8 x i64> @cttz_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-D-NEXT:    vand.vv v16, v8, v16
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v16, v8, v16
+; CHECK-D-NEXT:    vfcvt.f.xu.v v16, v16
 ; CHECK-D-NEXT:    vsrl.vx v16, v16, a1
 ; CHECK-D-NEXT:    li a1, 1023
-; CHECK-D-NEXT:    vsub.vx v16, v16, a1
 ; CHECK-D-NEXT:    vmseq.vi v0, v8, 0
+; CHECK-D-NEXT:    vsub.vx v8, v16, a1
 ; CHECK-D-NEXT:    li a1, 64
-; CHECK-D-NEXT:    vmerge.vxm v8, v16, a1, v0
+; CHECK-D-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; CHECK-D-NEXT:    fsrm a0
 ; CHECK-D-NEXT:    ret
 ;
@@ -1641,14 +1641,14 @@ define <vscale x 1 x i8> @cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1702,14 +1702,14 @@ define <vscale x 2 x i8> @cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1763,14 +1763,14 @@ define <vscale x 4 x i8> @cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1824,14 +1824,14 @@ define <vscale x 8 x i8> @cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1885,14 +1885,14 @@ define <vscale x 16 x i8> @cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 85
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    li a0, 51
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
@@ -1946,14 +1946,14 @@ define <vscale x 32 x i8> @cttz_zero_undef_nxv32i8(<vscale x 32 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -1977,14 +1977,14 @@ define <vscale x 64 x i8> @cttz_zero_undef_nxv64i8(<vscale x 64 x i8> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2008,24 +2008,24 @@ define <vscale x 1 x i16> @cttz_zero_undef_nxv1i16(<vscale x 1 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2068,24 +2068,24 @@ define <vscale x 2 x i16> @cttz_zero_undef_nxv2i16(<vscale x 2 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2128,24 +2128,24 @@ define <vscale x 4 x i16> @cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2188,24 +2188,24 @@ define <vscale x 8 x i16> @cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2248,24 +2248,24 @@ define <vscale x 16 x i16> @cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v12, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v12, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 5
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 3
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 1
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 1
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    li a0, 257
 ; CHECK-ZVE64X-NEXT:    vmul.vx v8, v8, a0
@@ -2308,24 +2308,24 @@ define <vscale x 32 x i16> @cttz_zero_undef_nxv32i16(<vscale x 32 x i16> %va) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a0
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2346,24 +2346,24 @@ define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2375,8 +2375,8 @@ define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2388,9 +2388,9 @@ define <vscale x 1 x i32> @cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v9
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v8, v9, a0
 ; CHECK-D-NEXT:    li a0, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a0
@@ -2410,24 +2410,24 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v9, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v9, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v9, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v9, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-ZVE64X-NEXT:    vand.vx v9, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v9
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2439,8 +2439,8 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2452,9 +2452,9 @@ define <vscale x 2 x i32> @cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v9
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v8, v10, a0
 ; CHECK-D-NEXT:    li a0, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a0
@@ -2474,24 +2474,24 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v10, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v10, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v10, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v10, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-ZVE64X-NEXT:    vand.vx v10, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v10
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2503,8 +2503,8 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2516,9 +2516,9 @@ define <vscale x 4 x i32> @cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v10
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v8, v12, a0
 ; CHECK-D-NEXT:    li a0, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a0
@@ -2538,24 +2538,24 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v12, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v12, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v12, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v12, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-ZVE64X-NEXT:    vand.vx v12, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v12
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2567,8 +2567,8 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2580,9 +2580,9 @@ define <vscale x 8 x i32> @cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vand.vv v8, v8, v12
 ; CHECK-D-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-D-NEXT:    li a0, 52
 ; CHECK-D-NEXT:    vnsrl.wx v8, v16, a0
 ; CHECK-D-NEXT:    li a0, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a0
@@ -2602,24 +2602,24 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-ZVE64X:       # %bb.0:
 ; CHECK-ZVE64X-NEXT:    li a0, 1
 ; CHECK-ZVE64X-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-ZVE64X-NEXT:    vsub.vx v16, v8, a0
-; CHECK-ZVE64X-NEXT:    vnot.v v8, v8
-; CHECK-ZVE64X-NEXT:    vand.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-ZVE64X-NEXT:    vnot.v v16, v8
+; CHECK-ZVE64X-NEXT:    vsub.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 349525
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 1365
+; CHECK-ZVE64X-NEXT:    vand.vv v8, v16, v8
+; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v16, a0
-; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    lui a0, 209715
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 819
+; CHECK-ZVE64X-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-ZVE64X-NEXT:    vand.vx v16, v8, a0
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
+; CHECK-ZVE64X-NEXT:    lui a0, 61681
+; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-ZVE64X-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-ZVE64X-NEXT:    vadd.vv v8, v8, v16
-; CHECK-ZVE64X-NEXT:    lui a0, 61681
-; CHECK-ZVE64X-NEXT:    addi a0, a0, -241
 ; CHECK-ZVE64X-NEXT:    vand.vx v8, v8, a0
 ; CHECK-ZVE64X-NEXT:    lui a0, 4112
 ; CHECK-ZVE64X-NEXT:    addi a0, a0, 257
@@ -2631,8 +2631,8 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-F-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-F-NEXT:    li a1, 127
@@ -2644,8 +2644,8 @@ define <vscale x 16 x i32> @cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v16
 ; CHECK-D-NEXT:    fsrmi a0, 1
+; CHECK-D-NEXT:    vand.vv v8, v8, v16
 ; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-D-NEXT:    li a1, 127
@@ -2667,39 +2667,39 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vsub.vx v9, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vnot.v v9, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 1
+; RV32I-NEXT:    vand.vv v9, v9, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2708,37 +2708,37 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; RV64I-LABEL: cttz_zero_undef_nxv1i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m1, ta, ma
 ; RV64I-NEXT:    vsub.vx v9, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v9, v9, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v9, v8, a0
+; RV64I-NEXT:    vand.vx v9, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2747,8 +2747,8 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v9
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v9, v8
 ; CHECK-F-NEXT:    vsrl.vi v9, v9, 23
@@ -2761,10 +2761,10 @@ define <vscale x 1 x i64> @cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v9
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v8, v8, v9
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a1
@@ -2785,39 +2785,39 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vsub.vx v10, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vnot.v v10, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v10, v10, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2826,37 +2826,37 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; RV64I-LABEL: cttz_zero_undef_nxv2i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m2, ta, ma
 ; RV64I-NEXT:    vsub.vx v10, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v10, v10, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v10, v8, a0
+; RV64I-NEXT:    vand.vx v10, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2865,8 +2865,8 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v10
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v10, v8
 ; CHECK-F-NEXT:    vsrl.vi v10, v10, 23
@@ -2879,10 +2879,10 @@ define <vscale x 2 x i64> @cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v10
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v8, v8, v10
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a1
@@ -2903,39 +2903,39 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vsub.vx v12, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vnot.v v12, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v12, v12, v16
-; RV32I-NEXT:    vsub.vv v8, v8, v12
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v12, v12, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v16, v8, v12
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v12
-; RV32I-NEXT:    vadd.vv v8, v16, v8
-; RV32I-NEXT:    vsrl.vi v12, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 61681
 ; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v12
+; RV32I-NEXT:    vand.vv v12, v8, v16
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vmv.v.x v16, a0
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v12, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v12
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32I-NEXT:    vmv.v.x v12, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    vmul.vv v8, v8, v12
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
@@ -2944,37 +2944,37 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; RV64I-LABEL: cttz_zero_undef_nxv4i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m4, ta, ma
 ; RV64I-NEXT:    vsub.vx v12, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v12
 ; RV64I-NEXT:    vsrl.vi v12, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v12, v12, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v12, v8, a0
+; RV64I-NEXT:    vand.vx v12, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v12, v8
 ; RV64I-NEXT:    vsrl.vi v12, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v12
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -2983,8 +2983,8 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v12
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v12, v8
 ; CHECK-F-NEXT:    vsrl.vi v12, v12, 23
@@ -2997,10 +2997,10 @@ define <vscale x 4 x i64> @cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v12
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v8, v8, v12
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a1
@@ -3021,40 +3021,40 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a0, 1
 ; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vsub.vx v16, v8, a0
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vnot.v v16, v8
+; RV32I-NEXT:    vsub.vx v8, v8, a0
 ; RV32I-NEXT:    lui a0, 349525
 ; RV32I-NEXT:    addi a0, a0, 1365
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v24, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v16, v16, v24
-; RV32I-NEXT:    vsub.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 209715
 ; RV32I-NEXT:    addi a0, a0, 819
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vand.vv v8, v16, v8
+; RV32I-NEXT:    vsrl.vi v16, v8, 1
+; RV32I-NEXT:    vand.vv v24, v16, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32I-NEXT:    lui a0, 61681
+; RV32I-NEXT:    addi a0, a0, -241
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vsub.vv v8, v8, v24
 ; RV32I-NEXT:    vand.vv v24, v8, v16
 ; RV32I-NEXT:    vsrl.vi v8, v8, 2
 ; RV32I-NEXT:    vand.vv v8, v8, v16
-; RV32I-NEXT:    vadd.vv v8, v24, v8
-; RV32I-NEXT:    vsrl.vi v16, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v16
-; RV32I-NEXT:    lui a0, 61681
-; RV32I-NEXT:    addi a0, a0, -241
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
 ; RV32I-NEXT:    vmv.v.x v16, a0
-; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v16
 ; RV32I-NEXT:    lui a0, 4112
 ; RV32I-NEXT:    addi a0, a0, 257
+; RV32I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v24, v8
+; RV32I-NEXT:    vsrl.vi v24, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v24
 ; RV32I-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32I-NEXT:    vmv.v.x v16, a0
+; RV32I-NEXT:    vmv.v.x v24, a0
 ; RV32I-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV32I-NEXT:    vmul.vv v8, v8, v16
+; RV32I-NEXT:    vand.vv v8, v8, v16
+; RV32I-NEXT:    vmul.vv v8, v8, v24
 ; RV32I-NEXT:    li a0, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a0
 ; RV32I-NEXT:    ret
@@ -3062,37 +3062,37 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; RV64I-LABEL: cttz_zero_undef_nxv8i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    li a0, 1
-; RV64I-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    vsetvli a5, zero, e64, m8, ta, ma
 ; RV64I-NEXT:    vsub.vx v16, v8, a0
+; RV64I-NEXT:    addiw a0, a1, 1365
+; RV64I-NEXT:    addiw a1, a2, 819
+; RV64I-NEXT:    addiw a2, a3, -241
+; RV64I-NEXT:    addiw a3, a4, 257
+; RV64I-NEXT:    slli a4, a0, 32
+; RV64I-NEXT:    add a0, a0, a4
+; RV64I-NEXT:    slli a4, a1, 32
+; RV64I-NEXT:    add a1, a1, a4
+; RV64I-NEXT:    slli a4, a2, 32
+; RV64I-NEXT:    add a2, a2, a4
+; RV64I-NEXT:    slli a4, a3, 32
+; RV64I-NEXT:    add a3, a3, a4
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v16
 ; RV64I-NEXT:    vsrl.vi v16, v8, 1
-; RV64I-NEXT:    lui a0, 349525
-; RV64I-NEXT:    addiw a0, a0, 1365
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    vand.vx v16, v16, a0
 ; RV64I-NEXT:    vsub.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 209715
-; RV64I-NEXT:    addiw a0, a0, 819
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v16, v8, a0
+; RV64I-NEXT:    vand.vx v16, v8, a1
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a1
 ; RV64I-NEXT:    vadd.vv v8, v16, v8
 ; RV64I-NEXT:    vsrl.vi v16, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v16
-; RV64I-NEXT:    lui a0, 61681
-; RV64I-NEXT:    addiw a0, a0, -241
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vand.vx v8, v8, a0
-; RV64I-NEXT:    lui a0, 4112
-; RV64I-NEXT:    addiw a0, a0, 257
-; RV64I-NEXT:    slli a1, a0, 32
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    vmul.vx v8, v8, a0
+; RV64I-NEXT:    vand.vx v8, v8, a2
+; RV64I-NEXT:    vmul.vx v8, v8, a3
 ; RV64I-NEXT:    li a0, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a0
 ; RV64I-NEXT:    ret
@@ -3101,8 +3101,8 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-F:       # %bb.0:
 ; CHECK-F-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-F-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    fsrmi a0, 1
+; CHECK-F-NEXT:    vand.vv v8, v8, v16
 ; CHECK-F-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-F-NEXT:    vfncvt.f.xu.w v16, v8
 ; CHECK-F-NEXT:    vsrl.vi v16, v16, 23
@@ -3115,10 +3115,10 @@ define <vscale x 8 x i64> @cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va) {
 ; CHECK-D:       # %bb.0:
 ; CHECK-D-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-D-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-D-NEXT:    vand.vv v8, v8, v16
 ; CHECK-D-NEXT:    fsrmi a0, 1
-; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    li a1, 52
+; CHECK-D-NEXT:    vand.vv v8, v8, v16
+; CHECK-D-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-D-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-D-NEXT:    li a1, 1023
 ; CHECK-D-NEXT:    vsub.vx v8, v8, a1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index 619c05dd8ab740..9e6295b6644171 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -16,13 +16,13 @@ define <vscale x 1 x i8> @vp_cttz_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -46,14 +46,14 @@ define <vscale x 1 x i8> @vp_cttz_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -80,13 +80,13 @@ define <vscale x 2 x i8> @vp_cttz_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -110,14 +110,14 @@ define <vscale x 2 x i8> @vp_cttz_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -144,13 +144,13 @@ define <vscale x 4 x i8> @vp_cttz_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -174,14 +174,14 @@ define <vscale x 4 x i8> @vp_cttz_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -208,13 +208,13 @@ define <vscale x 8 x i8> @vp_cttz_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -238,14 +238,14 @@ define <vscale x 8 x i8> @vp_cttz_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zer
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -272,13 +272,13 @@ define <vscale x 16 x i8> @vp_cttz_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -302,14 +302,14 @@ define <vscale x 16 x i8> @vp_cttz_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -336,13 +336,13 @@ define <vscale x 32 x i8> @vp_cttz_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -366,14 +366,14 @@ define <vscale x 32 x i8> @vp_cttz_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -400,13 +400,13 @@ define <vscale x 64 x i8> @vp_cttz_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -430,14 +430,14 @@ define <vscale x 64 x i8> @vp_cttz_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -464,23 +464,23 @@ define <vscale x 1 x i16> @vp_cttz_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -501,24 +501,24 @@ define <vscale x 1 x i16> @vp_cttz_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -542,23 +542,23 @@ define <vscale x 2 x i16> @vp_cttz_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -579,24 +579,24 @@ define <vscale x 2 x i16> @vp_cttz_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -620,23 +620,23 @@ define <vscale x 4 x i16> @vp_cttz_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -657,24 +657,24 @@ define <vscale x 4 x i16> @vp_cttz_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -698,23 +698,23 @@ define <vscale x 8 x i16> @vp_cttz_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -735,24 +735,24 @@ define <vscale x 8 x i16> @vp_cttz_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -776,23 +776,23 @@ define <vscale x 16 x i16> @vp_cttz_nxv16i16(<vscale x 16 x i16> %va, <vscale x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -813,24 +813,24 @@ define <vscale x 16 x i16> @vp_cttz_nxv16i16_unmasked(<vscale x 16 x i16> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -854,23 +854,23 @@ define <vscale x 32 x i16> @vp_cttz_nxv32i16(<vscale x 32 x i16> %va, <vscale x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -891,24 +891,24 @@ define <vscale x 32 x i16> @vp_cttz_nxv32i16_unmasked(<vscale x 32 x i16> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -932,23 +932,23 @@ define <vscale x 1 x i32> @vp_cttz_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -970,24 +970,24 @@ define <vscale x 1 x i32> @vp_cttz_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1012,23 +1012,23 @@ define <vscale x 2 x i32> @vp_cttz_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1050,24 +1050,24 @@ define <vscale x 2 x i32> @vp_cttz_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1092,23 +1092,23 @@ define <vscale x 4 x i32> @vp_cttz_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1130,24 +1130,24 @@ define <vscale x 4 x i32> @vp_cttz_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1172,23 +1172,23 @@ define <vscale x 8 x i32> @vp_cttz_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1210,24 +1210,24 @@ define <vscale x 8 x i32> @vp_cttz_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1252,23 +1252,23 @@ define <vscale x 16 x i32> @vp_cttz_nxv16i32(<vscale x 16 x i32> %va, <vscale x
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1290,24 +1290,24 @@ define <vscale x 16 x i32> @vp_cttz_nxv16i32_unmasked(<vscale x 16 x i32> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -1332,78 +1332,78 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv1i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1421,39 +1421,39 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vnot.v v9, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1462,37 +1462,37 @@ define <vscale x 1 x i64> @vp_cttz_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv1i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1514,78 +1514,78 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1603,39 +1603,39 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vnot.v v10, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1644,37 +1644,37 @@ define <vscale x 2 x i64> @vp_cttz_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1696,78 +1696,78 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1785,39 +1785,39 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vnot.v v12, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1826,37 +1826,37 @@ define <vscale x 4 x i64> @vp_cttz_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1878,78 +1878,78 @@ define <vscale x 7 x i64> @vp_cttz_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv7i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -1967,40 +1967,40 @@ define <vscale x 7 x i64> @vp_cttz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vnot.v v16, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
@@ -2008,37 +2008,37 @@ define <vscale x 7 x i64> @vp_cttz_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv7i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -2060,78 +2060,78 @@ define <vscale x 8 x i64> @vp_cttz_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_nxv8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
 ;
@@ -2149,40 +2149,40 @@ define <vscale x 8 x i64> @vp_cttz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vnot.v v16, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsub.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v24, v16, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v24, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vadd.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vsrl.vi v24, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
 ; RV32-NEXT:    ret
@@ -2190,37 +2190,37 @@ define <vscale x 8 x i64> @vp_cttz_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32
 ; RV64-LABEL: vp_cttz_nxv8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -2253,37 +2253,44 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a2, a1, 3
-; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a2
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a3, a3, a2
 ; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    srli a3, a1, 3
+; RV32-NEXT:    sub a4, a0, a1
+; RV32-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a3
+; RV32-NEXT:    sltu a3, a0, a4
+; RV32-NEXT:    addi a3, a3, -1
+; RV32-NEXT:    and a3, a3, a4
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    addi a4, a4, 1365
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    li a6, 48
+; RV32-NEXT:    mul a5, a5, a6
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a4
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    li a5, 48
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    li a5, 24
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
@@ -2293,22 +2300,21 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
+; RV32-NEXT:    li a5, 24
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    li a5, 48
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vsub.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    lui a4, 209715
 ; RV32-NEXT:    addi a4, a4, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
@@ -2388,11 +2394,11 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
-; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
@@ -2500,47 +2506,47 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a2, a1, 3
-; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a2
-; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    sltu a3, a0, a2
-; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a3, a3, a2
 ; RV64-NEXT:    li a2, 1
-; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v8, v16, a2, v0.t
-; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV64-NEXT:    lui a3, 349525
-; RV64-NEXT:    addiw a3, a3, 1365
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsub.vv v16, v16, v8, v0.t
 ; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    srli a7, a1, 3
+; RV64-NEXT:    sub t0, a0, a1
+; RV64-NEXT:    addiw a3, a3, 1365
 ; RV64-NEXT:    addiw a4, a4, 819
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v16, a4, v0.t
-; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw t1, a6, 257
+; RV64-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a7
+; RV64-NEXT:    slli a7, a3, 32
+; RV64-NEXT:    add a7, a3, a7
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    add a6, a4, a6
+; RV64-NEXT:    slli a3, a5, 32
+; RV64-NEXT:    add a3, a5, a3
+; RV64-NEXT:    slli a4, t1, 32
+; RV64-NEXT:    add a4, t1, a4
+; RV64-NEXT:    sltu a5, a0, t0
+; RV64-NEXT:    addi a5, a5, -1
+; RV64-NEXT:    and t0, a5, t0
+; RV64-NEXT:    li a5, 56
+; RV64-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
+; RV64-NEXT:    vsub.vx v8, v16, a2, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a5, 61681
-; RV64-NEXT:    addiw a5, a5, -241
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addiw a6, a6, 257
-; RV64-NEXT:    slli a7, a6, 32
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    vmul.vx v8, v8, a6, v0.t
-; RV64-NEXT:    li a7, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    addi t0, sp, 16
 ; RV64-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
 ; RV64-NEXT:    bltu a0, a1, .LBB46_2
@@ -2557,17 +2563,17 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a6, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a6, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2583,12 +2589,12 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB46_2
@@ -2613,31 +2619,30 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    sub a2, a0, a1
-; RV32-NEXT:    sltu a3, a0, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    and a3, a3, a2
 ; RV32-NEXT:    li a2, 1
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    lui a4, 209715
+; RV32-NEXT:    sub a5, a0, a1
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    addi a4, a4, 819
+; RV32-NEXT:    vsetvli a6, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v0, a3
+; RV32-NEXT:    sltu a3, a0, a5
+; RV32-NEXT:    addi a3, a3, -1
+; RV32-NEXT:    and a3, a3, a5
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v24, v16, a2
 ; RV32-NEXT:    vnot.v v16, v16
 ; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsrl.vi v24, v16, 1
-; RV32-NEXT:    lui a4, 349525
-; RV32-NEXT:    addi a4, a4, 1365
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v0, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v0, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    li a6, 24
+; RV32-NEXT:    mul a5, a5, a6
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vs8r.v v0, (a5) # Unknown-size Folded Spill
 ; RV32-NEXT:    vand.vv v24, v24, v0
 ; RV32-NEXT:    vsub.vv v16, v16, v24
-; RV32-NEXT:    lui a4, 209715
-; RV32-NEXT:    addi a4, a4, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v0, a4
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
@@ -2653,8 +2658,10 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    vsrl.vi v24, v16, 4
 ; RV32-NEXT:    vadd.vv v16, v16, v24
 ; RV32-NEXT:    lui a4, 61681
+; RV32-NEXT:    lui a5, 4112
 ; RV32-NEXT:    addi a4, a4, -241
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
+; RV32-NEXT:    addi a5, a5, 257
+; RV32-NEXT:    vsetvli a6, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a4
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 3
@@ -2663,10 +2670,8 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    lui a4, 4112
-; RV32-NEXT:    addi a4, a4, 257
-; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v24, a4
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
+; RV32-NEXT:    vmv.v.x v24, a5
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
@@ -2722,42 +2727,42 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV64-LABEL: vp_cttz_nxv16i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    sub a2, a0, a1
-; RV64-NEXT:    sltu a3, a0, a2
-; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    and a3, a3, a2
 ; RV64-NEXT:    li a2, 1
-; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    sub a7, a0, a1
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw t0, a5, -241
+; RV64-NEXT:    addiw t1, a6, 257
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a6, a3, a6
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a5, a4, a5
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
+; RV64-NEXT:    slli a4, t1, 32
+; RV64-NEXT:    add a4, t1, a4
+; RV64-NEXT:    sltu t0, a0, a7
+; RV64-NEXT:    addi t0, t0, -1
+; RV64-NEXT:    and a7, t0, a7
+; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v24, v16, a2
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    lui a3, 349525
-; RV64-NEXT:    addiw a3, a3, 1365
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vand.vx v24, v24, a6
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    lui a4, 209715
-; RV64-NEXT:    addiw a4, a4, 819
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v24, v16, a4
+; RV64-NEXT:    vand.vx v24, v16, a5
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vand.vx v16, v16, a5
 ; RV64-NEXT:    vadd.vv v16, v24, v16
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    lui a5, 61681
-; RV64-NEXT:    addiw a5, a5, -241
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vand.vx v16, v16, a5
-; RV64-NEXT:    lui a6, 4112
-; RV64-NEXT:    addiw a6, a6, 257
-; RV64-NEXT:    slli a7, a6, 32
-; RV64-NEXT:    add a6, a6, a7
-; RV64-NEXT:    vmul.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vmul.vx v16, v16, a4
 ; RV64-NEXT:    li a7, 56
 ; RV64-NEXT:    vsrl.vx v16, v16, a7
 ; RV64-NEXT:    bltu a0, a1, .LBB47_2
@@ -2769,16 +2774,16 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64_unmasked(<vscale x 16 x i64> %va, i
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vand.vx v24, v24, a6
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vand.vx v24, v8, a5
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vand.vx v8, v8, a5
 ; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    vand.vx v8, v8, a5
-; RV64-NEXT:    vmul.vx v8, v8, a6
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    vsrl.vx v8, v8, a7
 ; RV64-NEXT:    ret
 ;
@@ -2807,6 +2812,7 @@ define <vscale x 1 x i8> @vp_cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
@@ -2817,7 +2823,6 @@ define <vscale x 1 x i8> @vp_cttz_zero_undef_nxv1i8(<vscale x 1 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2861,6 +2866,7 @@ define <vscale x 2 x i8> @vp_cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
@@ -2871,7 +2877,6 @@ define <vscale x 2 x i8> @vp_cttz_zero_undef_nxv2i8(<vscale x 2 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2915,6 +2920,7 @@ define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v9, v8, v0.t
@@ -2925,7 +2931,6 @@ define <vscale x 4 x i8> @vp_cttz_zero_undef_nxv4i8(<vscale x 4 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -2969,6 +2974,7 @@ define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v10, v8, v0.t
@@ -2979,7 +2985,6 @@ define <vscale x 8 x i8> @vp_cttz_zero_undef_nxv8i8(<vscale x 8 x i8> %va, <vsca
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3023,6 +3028,7 @@ define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8, v0.t
@@ -3033,7 +3039,6 @@ define <vscale x 16 x i8> @vp_cttz_zero_undef_nxv16i8(<vscale x 16 x i8> %va, <v
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3078,13 +3083,13 @@ define <vscale x 32 x i8> @vp_cttz_zero_undef_nxv32i8(<vscale x 32 x i8> %va, <v
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -3108,14 +3113,14 @@ define <vscale x 32 x i8> @vp_cttz_zero_undef_nxv32i8_unmasked(<vscale x 32 x i8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -3141,13 +3146,13 @@ define <vscale x 64 x i8> @vp_cttz_zero_undef_nxv64i8(<vscale x 64 x i8> %va, <v
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -3171,14 +3176,14 @@ define <vscale x 64 x i8> @vp_cttz_zero_undef_nxv64i8_unmasked(<vscale x 64 x i8
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -3203,13 +3208,13 @@ define <vscale x 1 x i16> @vp_cttz_zero_undef_nxv1i16(<vscale x 1 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3249,13 +3254,13 @@ define <vscale x 2 x i16> @vp_cttz_zero_undef_nxv2i16(<vscale x 2 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3295,13 +3300,13 @@ define <vscale x 4 x i16> @vp_cttz_zero_undef_nxv4i16(<vscale x 4 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v10, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3341,13 +3346,13 @@ define <vscale x 8 x i16> @vp_cttz_zero_undef_nxv8i16(<vscale x 8 x i16> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v12, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3387,13 +3392,13 @@ define <vscale x 16 x i16> @vp_cttz_zero_undef_nxv16i16(<vscale x 16 x i16> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v16, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3434,23 +3439,23 @@ define <vscale x 32 x i16> @vp_cttz_zero_undef_nxv32i16(<vscale x 32 x i16> %va,
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsub.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
-; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vand.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -241
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -3471,24 +3476,24 @@ define <vscale x 32 x i16> @vp_cttz_zero_undef_nxv32i16_unmasked(<vscale x 32 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsub.vx v16, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v16
-; CHECK-NEXT:    vsrl.vi v16, v8, 1
+; CHECK-NEXT:    vnot.v v16, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v16, v8
+; CHECK-NEXT:    vsrl.vi v16, v8, 1
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v16
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -3510,14 +3515,14 @@ define <vscale x 1 x i32> @vp_cttz_zero_undef_nxv1i32(<vscale x 1 x i32> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v9, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3535,9 +3540,9 @@ define <vscale x 1 x i32> @vp_cttz_zero_undef_nxv1i32_unmasked(<vscale x 1 x i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v9, a0
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
@@ -3558,14 +3563,14 @@ define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32(<vscale x 2 x i32> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8, v0.t
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v10, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v10, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3583,9 +3588,9 @@ define <vscale x 2 x i32> @vp_cttz_zero_undef_nxv2i32_unmasked(<vscale x 2 x i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vfwcvt.f.xu.v v10, v8
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v10, a0
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
@@ -3606,14 +3611,14 @@ define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32(<vscale x 4 x i32> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8, v0.t
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v12, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v12, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v12, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3631,9 +3636,9 @@ define <vscale x 4 x i32> @vp_cttz_zero_undef_nxv4i32_unmasked(<vscale x 4 x i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    vfwcvt.f.xu.v v12, v8
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v12, a0
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
@@ -3654,14 +3659,14 @@ define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32(<vscale x 8 x i32> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8, v0.t
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vx v8, v16, a0, v0.t
+; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v16, a0, v0.t
 ; CHECK-NEXT:    ret
 ;
@@ -3679,9 +3684,9 @@ define <vscale x 8 x i32> @vp_cttz_zero_undef_nxv8i32_unmasked(<vscale x 8 x i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
+; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vand.vv v8, v8, v12
 ; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
-; CHECK-NEXT:    li a0, 52
 ; CHECK-NEXT:    vnsrl.wx v8, v16, a0
 ; CHECK-NEXT:    li a0, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a0
@@ -3702,11 +3707,11 @@ define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32(<vscale x 16 x i32> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    li a1, 127
+; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23, v0.t
-; CHECK-NEXT:    li a1, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    fsrm a0
 ; CHECK-NEXT:    ret
@@ -3725,8 +3730,8 @@ define <vscale x 16 x i32> @vp_cttz_zero_undef_nxv16i32_unmasked(<vscale x 16 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v8, v8, 23
 ; CHECK-NEXT:    li a1, 127
@@ -3749,10 +3754,10 @@ define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64(<vscale x 1 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3773,10 +3778,10 @@ define <vscale x 1 x i64> @vp_cttz_zero_undef_nxv1i64_unmasked(<vscale x 1 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v9
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -3798,10 +3803,10 @@ define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64(<vscale x 2 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3822,10 +3827,10 @@ define <vscale x 2 x i64> @vp_cttz_zero_undef_nxv2i64_unmasked(<vscale x 2 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vrsub.vi v10, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v10
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -3847,10 +3852,10 @@ define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64(<vscale x 4 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3871,10 +3876,10 @@ define <vscale x 4 x i64> @vp_cttz_zero_undef_nxv4i64_unmasked(<vscale x 4 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v12, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v12
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v12
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -3896,10 +3901,10 @@ define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64(<vscale x 7 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3920,10 +3925,10 @@ define <vscale x 7 x i64> @vp_cttz_zero_undef_nxv7i64_unmasked(<vscale x 7 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -3945,10 +3950,10 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64(<vscale x 8 x i64> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1, v0.t
@@ -3969,10 +3974,10 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0
-; CHECK-NEXT:    vand.vv v8, v8, v16
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    li a1, 52
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vfcvt.f.xu.v v8, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
@@ -4004,20 +4009,20 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a4, a0, a1
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    sltu a2, a0, a4
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a4, a2, a4
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v8, v16, 0, v0.t
 ; CHECK-NEXT:    vand.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
-; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    li a3, 1023
 ; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
@@ -4034,8 +4039,8 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
+; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
@@ -4055,12 +4060,12 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-ZVBB-NEXT:    vmv1r.v v24, v0
 ; CHECK-ZVBB-NEXT:    csrr a1, vlenb
 ; CHECK-ZVBB-NEXT:    srli a2, a1, 3
-; CHECK-ZVBB-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-ZVBB-NEXT:    sub a3, a0, a1
+; CHECK-ZVBB-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-ZVBB-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-ZVBB-NEXT:    sub a2, a0, a1
-; CHECK-ZVBB-NEXT:    sltu a3, a0, a2
-; CHECK-ZVBB-NEXT:    addi a3, a3, -1
-; CHECK-ZVBB-NEXT:    and a2, a3, a2
+; CHECK-ZVBB-NEXT:    sltu a2, a0, a3
+; CHECK-ZVBB-NEXT:    addi a2, a2, -1
+; CHECK-ZVBB-NEXT:    and a2, a2, a3
 ; CHECK-ZVBB-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-ZVBB-NEXT:    vctz.v v16, v16, v0.t
 ; CHECK-ZVBB-NEXT:    bltu a0, a1, .LBB94_2
@@ -4079,17 +4084,17 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64_unmasked(<vscale x 16 x
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    sltu a4, a0, a2
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    and a4, a4, a2
+; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vrsub.vi v24, v16, 0
 ; CHECK-NEXT:    vand.vv v16, v16, v24
-; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vfcvt.f.xu.v v16, v16
-; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    li a2, 52
+; CHECK-NEXT:    fsrm a3
 ; CHECK-NEXT:    vsrl.vx v16, v16, a2
 ; CHECK-NEXT:    li a3, 1023
 ; CHECK-NEXT:    vsub.vx v16, v16, a3
@@ -4161,13 +4166,13 @@ define <vscale x 1 x i9> @vp_zero_undef_cttz_nxv1i9(<vscale x 1 x i9> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vrsub.vi v9, v8, 0, v0.t
+; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vfwcvt.f.xu.v v9, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v8, v9, 23, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v8, 0, v0.t
-; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsub.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
index 2b4b8e979f3d71..31fa5d025156f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/dont-sink-splat-operands.ll
@@ -87,35 +87,35 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
 ; NO-SINK-LABEL: sink_splat_add_scalable:
 ; NO-SINK:       # %bb.0: # %entry
 ; NO-SINK-NEXT:    csrr a5, vlenb
-; NO-SINK-NEXT:    srli a2, a5, 1
-; NO-SINK-NEXT:    li a3, 1024
-; NO-SINK-NEXT:    bgeu a3, a2, .LBB1_2
+; NO-SINK-NEXT:    srli a3, a5, 1
+; NO-SINK-NEXT:    li a2, 1024
+; NO-SINK-NEXT:    bgeu a2, a3, .LBB1_2
 ; NO-SINK-NEXT:  # %bb.1:
-; NO-SINK-NEXT:    li a3, 0
+; NO-SINK-NEXT:    li a2, 0
 ; NO-SINK-NEXT:    j .LBB1_5
 ; NO-SINK-NEXT:  .LBB1_2: # %vector.ph
-; NO-SINK-NEXT:    addi a3, a2, -1
-; NO-SINK-NEXT:    andi a4, a3, 1024
-; NO-SINK-NEXT:    xori a3, a4, 1024
+; NO-SINK-NEXT:    addi a2, a3, -1
+; NO-SINK-NEXT:    andi a4, a2, 1024
+; NO-SINK-NEXT:    xori a2, a4, 1024
 ; NO-SINK-NEXT:    vsetvli a6, zero, e32, m2, ta, ma
 ; NO-SINK-NEXT:    vmv.v.x v8, a1
 ; NO-SINK-NEXT:    slli a5, a5, 1
 ; NO-SINK-NEXT:    mv a6, a0
-; NO-SINK-NEXT:    mv a7, a3
+; NO-SINK-NEXT:    mv a7, a2
 ; NO-SINK-NEXT:  .LBB1_3: # %vector.body
 ; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NO-SINK-NEXT:    vl2re32.v v10, (a6)
+; NO-SINK-NEXT:    sub a7, a7, a3
 ; NO-SINK-NEXT:    vadd.vv v10, v10, v8
 ; NO-SINK-NEXT:    vs2r.v v10, (a6)
-; NO-SINK-NEXT:    sub a7, a7, a2
 ; NO-SINK-NEXT:    add a6, a6, a5
 ; NO-SINK-NEXT:    bnez a7, .LBB1_3
 ; NO-SINK-NEXT:  # %bb.4: # %middle.block
 ; NO-SINK-NEXT:    beqz a4, .LBB1_7
 ; NO-SINK-NEXT:  .LBB1_5: # %for.body.preheader
-; NO-SINK-NEXT:    slli a2, a3, 2
-; NO-SINK-NEXT:    add a2, a0, a2
+; NO-SINK-NEXT:    slli a2, a2, 2
 ; NO-SINK-NEXT:    lui a3, 1
+; NO-SINK-NEXT:    add a2, a0, a2
 ; NO-SINK-NEXT:    add a0, a0, a3
 ; NO-SINK-NEXT:  .LBB1_6: # %for.body
 ; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -130,34 +130,34 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
 ; SINK-LABEL: sink_splat_add_scalable:
 ; SINK:       # %bb.0: # %entry
 ; SINK-NEXT:    csrr a5, vlenb
-; SINK-NEXT:    srli a2, a5, 1
-; SINK-NEXT:    li a3, 1024
-; SINK-NEXT:    bgeu a3, a2, .LBB1_2
+; SINK-NEXT:    srli a3, a5, 1
+; SINK-NEXT:    li a2, 1024
+; SINK-NEXT:    bgeu a2, a3, .LBB1_2
 ; SINK-NEXT:  # %bb.1:
-; SINK-NEXT:    li a3, 0
+; SINK-NEXT:    li a2, 0
 ; SINK-NEXT:    j .LBB1_5
 ; SINK-NEXT:  .LBB1_2: # %vector.ph
-; SINK-NEXT:    addi a3, a2, -1
-; SINK-NEXT:    andi a4, a3, 1024
-; SINK-NEXT:    xori a3, a4, 1024
+; SINK-NEXT:    addi a2, a3, -1
+; SINK-NEXT:    andi a4, a2, 1024
+; SINK-NEXT:    xori a2, a4, 1024
 ; SINK-NEXT:    slli a5, a5, 1
 ; SINK-NEXT:    mv a6, a0
-; SINK-NEXT:    mv a7, a3
+; SINK-NEXT:    mv a7, a2
 ; SINK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; SINK-NEXT:  .LBB1_3: # %vector.body
 ; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SINK-NEXT:    vl2re32.v v8, (a6)
+; SINK-NEXT:    sub a7, a7, a3
 ; SINK-NEXT:    vadd.vx v8, v8, a1
 ; SINK-NEXT:    vs2r.v v8, (a6)
-; SINK-NEXT:    sub a7, a7, a2
 ; SINK-NEXT:    add a6, a6, a5
 ; SINK-NEXT:    bnez a7, .LBB1_3
 ; SINK-NEXT:  # %bb.4: # %middle.block
 ; SINK-NEXT:    beqz a4, .LBB1_7
 ; SINK-NEXT:  .LBB1_5: # %for.body.preheader
-; SINK-NEXT:    slli a2, a3, 2
-; SINK-NEXT:    add a2, a0, a2
+; SINK-NEXT:    slli a2, a2, 2
 ; SINK-NEXT:    lui a3, 1
+; SINK-NEXT:    add a2, a0, a2
 ; SINK-NEXT:    add a0, a0, a3
 ; SINK-NEXT:  .LBB1_6: # %for.body
 ; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -172,34 +172,34 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
 ; DEFAULT-LABEL: sink_splat_add_scalable:
 ; DEFAULT:       # %bb.0: # %entry
 ; DEFAULT-NEXT:    csrr a5, vlenb
-; DEFAULT-NEXT:    srli a2, a5, 1
-; DEFAULT-NEXT:    li a3, 1024
-; DEFAULT-NEXT:    bgeu a3, a2, .LBB1_2
+; DEFAULT-NEXT:    srli a3, a5, 1
+; DEFAULT-NEXT:    li a2, 1024
+; DEFAULT-NEXT:    bgeu a2, a3, .LBB1_2
 ; DEFAULT-NEXT:  # %bb.1:
-; DEFAULT-NEXT:    li a3, 0
+; DEFAULT-NEXT:    li a2, 0
 ; DEFAULT-NEXT:    j .LBB1_5
 ; DEFAULT-NEXT:  .LBB1_2: # %vector.ph
-; DEFAULT-NEXT:    addi a3, a2, -1
-; DEFAULT-NEXT:    andi a4, a3, 1024
-; DEFAULT-NEXT:    xori a3, a4, 1024
+; DEFAULT-NEXT:    addi a2, a3, -1
+; DEFAULT-NEXT:    andi a4, a2, 1024
+; DEFAULT-NEXT:    xori a2, a4, 1024
 ; DEFAULT-NEXT:    slli a5, a5, 1
 ; DEFAULT-NEXT:    mv a6, a0
-; DEFAULT-NEXT:    mv a7, a3
+; DEFAULT-NEXT:    mv a7, a2
 ; DEFAULT-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; DEFAULT-NEXT:  .LBB1_3: # %vector.body
 ; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
 ; DEFAULT-NEXT:    vl2re32.v v8, (a6)
+; DEFAULT-NEXT:    sub a7, a7, a3
 ; DEFAULT-NEXT:    vadd.vx v8, v8, a1
 ; DEFAULT-NEXT:    vs2r.v v8, (a6)
-; DEFAULT-NEXT:    sub a7, a7, a2
 ; DEFAULT-NEXT:    add a6, a6, a5
 ; DEFAULT-NEXT:    bnez a7, .LBB1_3
 ; DEFAULT-NEXT:  # %bb.4: # %middle.block
 ; DEFAULT-NEXT:    beqz a4, .LBB1_7
 ; DEFAULT-NEXT:  .LBB1_5: # %for.body.preheader
-; DEFAULT-NEXT:    slli a2, a3, 2
-; DEFAULT-NEXT:    add a2, a0, a2
+; DEFAULT-NEXT:    slli a2, a2, 2
 ; DEFAULT-NEXT:    lui a3, 1
+; DEFAULT-NEXT:    add a2, a0, a2
 ; DEFAULT-NEXT:    add a0, a0, a3
 ; DEFAULT-NEXT:  .LBB1_6: # %for.body
 ; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -407,34 +407,34 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; NO-SINK-LABEL: sink_splat_fadd_scalable:
 ; NO-SINK:       # %bb.0: # %entry
 ; NO-SINK-NEXT:    csrr a1, vlenb
-; NO-SINK-NEXT:    srli a2, a1, 2
-; NO-SINK-NEXT:    li a3, 1024
-; NO-SINK-NEXT:    bgeu a3, a2, .LBB4_2
+; NO-SINK-NEXT:    srli a3, a1, 2
+; NO-SINK-NEXT:    li a2, 1024
+; NO-SINK-NEXT:    bgeu a2, a3, .LBB4_2
 ; NO-SINK-NEXT:  # %bb.1:
-; NO-SINK-NEXT:    li a3, 0
+; NO-SINK-NEXT:    li a2, 0
 ; NO-SINK-NEXT:    j .LBB4_5
 ; NO-SINK-NEXT:  .LBB4_2: # %vector.ph
-; NO-SINK-NEXT:    addi a3, a2, -1
-; NO-SINK-NEXT:    andi a4, a3, 1024
-; NO-SINK-NEXT:    xori a3, a4, 1024
+; NO-SINK-NEXT:    addi a2, a3, -1
+; NO-SINK-NEXT:    andi a4, a2, 1024
+; NO-SINK-NEXT:    xori a2, a4, 1024
 ; NO-SINK-NEXT:    vsetvli a5, zero, e32, m1, ta, ma
 ; NO-SINK-NEXT:    vfmv.v.f v8, fa0
 ; NO-SINK-NEXT:    mv a5, a0
-; NO-SINK-NEXT:    mv a6, a3
+; NO-SINK-NEXT:    mv a6, a2
 ; NO-SINK-NEXT:  .LBB4_3: # %vector.body
 ; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; NO-SINK-NEXT:    vl1re32.v v9, (a5)
+; NO-SINK-NEXT:    sub a6, a6, a3
 ; NO-SINK-NEXT:    vfadd.vv v9, v9, v8
 ; NO-SINK-NEXT:    vs1r.v v9, (a5)
-; NO-SINK-NEXT:    sub a6, a6, a2
 ; NO-SINK-NEXT:    add a5, a5, a1
 ; NO-SINK-NEXT:    bnez a6, .LBB4_3
 ; NO-SINK-NEXT:  # %bb.4: # %middle.block
 ; NO-SINK-NEXT:    beqz a4, .LBB4_7
 ; NO-SINK-NEXT:  .LBB4_5: # %for.body.preheader
-; NO-SINK-NEXT:    slli a1, a3, 2
-; NO-SINK-NEXT:    add a1, a0, a1
+; NO-SINK-NEXT:    slli a1, a2, 2
 ; NO-SINK-NEXT:    lui a2, 1
+; NO-SINK-NEXT:    add a1, a0, a1
 ; NO-SINK-NEXT:    add a0, a0, a2
 ; NO-SINK-NEXT:  .LBB4_6: # %for.body
 ; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -449,33 +449,33 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; SINK-LABEL: sink_splat_fadd_scalable:
 ; SINK:       # %bb.0: # %entry
 ; SINK-NEXT:    csrr a1, vlenb
-; SINK-NEXT:    srli a2, a1, 2
-; SINK-NEXT:    li a3, 1024
-; SINK-NEXT:    bgeu a3, a2, .LBB4_2
+; SINK-NEXT:    srli a3, a1, 2
+; SINK-NEXT:    li a2, 1024
+; SINK-NEXT:    bgeu a2, a3, .LBB4_2
 ; SINK-NEXT:  # %bb.1:
-; SINK-NEXT:    li a3, 0
+; SINK-NEXT:    li a2, 0
 ; SINK-NEXT:    j .LBB4_5
 ; SINK-NEXT:  .LBB4_2: # %vector.ph
-; SINK-NEXT:    addi a3, a2, -1
-; SINK-NEXT:    andi a4, a3, 1024
-; SINK-NEXT:    xori a3, a4, 1024
+; SINK-NEXT:    addi a2, a3, -1
+; SINK-NEXT:    andi a4, a2, 1024
+; SINK-NEXT:    xori a2, a4, 1024
 ; SINK-NEXT:    mv a5, a0
-; SINK-NEXT:    mv a6, a3
+; SINK-NEXT:    mv a6, a2
 ; SINK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; SINK-NEXT:  .LBB4_3: # %vector.body
 ; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SINK-NEXT:    vl1re32.v v8, (a5)
+; SINK-NEXT:    sub a6, a6, a3
 ; SINK-NEXT:    vfadd.vf v8, v8, fa0
 ; SINK-NEXT:    vs1r.v v8, (a5)
-; SINK-NEXT:    sub a6, a6, a2
 ; SINK-NEXT:    add a5, a5, a1
 ; SINK-NEXT:    bnez a6, .LBB4_3
 ; SINK-NEXT:  # %bb.4: # %middle.block
 ; SINK-NEXT:    beqz a4, .LBB4_7
 ; SINK-NEXT:  .LBB4_5: # %for.body.preheader
-; SINK-NEXT:    slli a1, a3, 2
-; SINK-NEXT:    add a1, a0, a1
+; SINK-NEXT:    slli a1, a2, 2
 ; SINK-NEXT:    lui a2, 1
+; SINK-NEXT:    add a1, a0, a1
 ; SINK-NEXT:    add a0, a0, a2
 ; SINK-NEXT:  .LBB4_6: # %for.body
 ; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -490,33 +490,33 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; DEFAULT-LABEL: sink_splat_fadd_scalable:
 ; DEFAULT:       # %bb.0: # %entry
 ; DEFAULT-NEXT:    csrr a1, vlenb
-; DEFAULT-NEXT:    srli a2, a1, 2
-; DEFAULT-NEXT:    li a3, 1024
-; DEFAULT-NEXT:    bgeu a3, a2, .LBB4_2
+; DEFAULT-NEXT:    srli a3, a1, 2
+; DEFAULT-NEXT:    li a2, 1024
+; DEFAULT-NEXT:    bgeu a2, a3, .LBB4_2
 ; DEFAULT-NEXT:  # %bb.1:
-; DEFAULT-NEXT:    li a3, 0
+; DEFAULT-NEXT:    li a2, 0
 ; DEFAULT-NEXT:    j .LBB4_5
 ; DEFAULT-NEXT:  .LBB4_2: # %vector.ph
-; DEFAULT-NEXT:    addi a3, a2, -1
-; DEFAULT-NEXT:    andi a4, a3, 1024
-; DEFAULT-NEXT:    xori a3, a4, 1024
+; DEFAULT-NEXT:    addi a2, a3, -1
+; DEFAULT-NEXT:    andi a4, a2, 1024
+; DEFAULT-NEXT:    xori a2, a4, 1024
 ; DEFAULT-NEXT:    mv a5, a0
-; DEFAULT-NEXT:    mv a6, a3
+; DEFAULT-NEXT:    mv a6, a2
 ; DEFAULT-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; DEFAULT-NEXT:  .LBB4_3: # %vector.body
 ; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
 ; DEFAULT-NEXT:    vl1re32.v v8, (a5)
+; DEFAULT-NEXT:    sub a6, a6, a3
 ; DEFAULT-NEXT:    vfadd.vf v8, v8, fa0
 ; DEFAULT-NEXT:    vs1r.v v8, (a5)
-; DEFAULT-NEXT:    sub a6, a6, a2
 ; DEFAULT-NEXT:    add a5, a5, a1
 ; DEFAULT-NEXT:    bnez a6, .LBB4_3
 ; DEFAULT-NEXT:  # %bb.4: # %middle.block
 ; DEFAULT-NEXT:    beqz a4, .LBB4_7
 ; DEFAULT-NEXT:  .LBB4_5: # %for.body.preheader
-; DEFAULT-NEXT:    slli a1, a3, 2
-; DEFAULT-NEXT:    add a1, a0, a1
+; DEFAULT-NEXT:    slli a1, a2, 2
 ; DEFAULT-NEXT:    lui a2, 1
+; DEFAULT-NEXT:    add a1, a0, a1
 ; DEFAULT-NEXT:    add a0, a0, a2
 ; DEFAULT-NEXT:  .LBB4_6: # %for.body
 ; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
index 92b88054a1d3bc..5b82b27a51510b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll
@@ -18,24 +18,24 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV32-NEXT:    lw a2, 12(a2)
 ; RV32-NEXT:    snez t2, a3
 ; RV32-NEXT:    sltiu t3, a3, 3
-; RV32-NEXT:    xori t3, t3, 1
 ; RV32-NEXT:    sltiu t4, a3, 4
-; RV32-NEXT:    xori t4, t4, 1
 ; RV32-NEXT:    sltiu a3, a3, 2
+; RV32-NEXT:    xori t3, t3, 1
+; RV32-NEXT:    xori t4, t4, 1
 ; RV32-NEXT:    xori a3, a3, 1
 ; RV32-NEXT:    and a3, a3, t0
 ; RV32-NEXT:    and a2, t4, a2
 ; RV32-NEXT:    and t0, t3, t1
 ; RV32-NEXT:    and a7, t2, a7
 ; RV32-NEXT:    neg a7, a7
-; RV32-NEXT:    and a4, a7, a4
-; RV32-NEXT:    neg a7, t0
-; RV32-NEXT:    and a6, a7, a6
+; RV32-NEXT:    neg t0, t0
 ; RV32-NEXT:    neg a2, a2
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a4, a7, a4
+; RV32-NEXT:    and a6, t0, a6
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    neg a2, a3
-; RV32-NEXT:    and a2, a2, a5
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    and a3, a3, a5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a4, a4, a6
 ; RV32-NEXT:    add a1, a4, a1
 ; RV32-NEXT:    add a0, a1, a0
@@ -54,24 +54,24 @@ define i32 @vpreduce_add_v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) {
 ; RV64-NEXT:    sext.w a3, a3
 ; RV64-NEXT:    snez t2, a3
 ; RV64-NEXT:    sltiu t3, a3, 3
-; RV64-NEXT:    xori t3, t3, 1
 ; RV64-NEXT:    sltiu t4, a3, 4
-; RV64-NEXT:    xori t4, t4, 1
 ; RV64-NEXT:    sltiu a3, a3, 2
+; RV64-NEXT:    xori t3, t3, 1
+; RV64-NEXT:    xori t4, t4, 1
 ; RV64-NEXT:    xori a3, a3, 1
 ; RV64-NEXT:    and a3, a3, t0
 ; RV64-NEXT:    and a2, t4, a2
 ; RV64-NEXT:    and t0, t3, t1
 ; RV64-NEXT:    and a7, t2, a7
 ; RV64-NEXT:    negw a7, a7
-; RV64-NEXT:    and a4, a7, a4
-; RV64-NEXT:    negw a7, t0
-; RV64-NEXT:    and a6, a7, a6
+; RV64-NEXT:    negw t0, t0
 ; RV64-NEXT:    negw a2, a2
+; RV64-NEXT:    negw a3, a3
+; RV64-NEXT:    and a4, a7, a4
+; RV64-NEXT:    and a6, t0, a6
 ; RV64-NEXT:    and a1, a2, a1
-; RV64-NEXT:    negw a2, a3
-; RV64-NEXT:    and a2, a2, a5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    and a3, a3, a5
+; RV64-NEXT:    add a1, a3, a1
 ; RV64-NEXT:    add a4, a4, a6
 ; RV64-NEXT:    add a1, a4, a1
 ; RV64-NEXT:    addw a0, a1, a0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
index fac033e982e10e..f1fcaed2762ae7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
@@ -229,38 +229,40 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vmv1r.v v7, v8
 ; CHECK-RV32-NEXT:    li a2, 128
+; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vslidedown.vi v9, v0, 1
+; CHECK-RV32-NEXT:    li a3, 32
+; CHECK-RV32-NEXT:    vmv.x.s a4, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV32-NEXT:    vle8.v v8, (a1)
+; CHECK-RV32-NEXT:    vle8.v v16, (a1)
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
 ; CHECK-RV32-NEXT:    slli a1, a1, 3
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v9, v0, 1
-; CHECK-RV32-NEXT:    li a1, 32
-; CHECK-RV32-NEXT:    vsrl.vx v10, v9, a1
-; CHECK-RV32-NEXT:    vmv.x.s a3, v10
-; CHECK-RV32-NEXT:    vsrl.vx v10, v0, a1
-; CHECK-RV32-NEXT:    vmv.x.s a1, v10
-; CHECK-RV32-NEXT:    vmv.x.s a4, v9
-; CHECK-RV32-NEXT:    vmv.x.s a5, v0
+; CHECK-RV32-NEXT:    vsrl.vx v10, v9, a3
+; CHECK-RV32-NEXT:    vsrl.vx v11, v0, a3
+; CHECK-RV32-NEXT:    vmv.x.s a1, v9
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a6, v0
-; CHECK-RV32-NEXT:    vsetvli zero, a6, e8, m8, ta, ma
+; CHECK-RV32-NEXT:    vcpop.m a3, v0
+; CHECK-RV32-NEXT:    cpop a4, a4
+; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.x.s a5, v10
+; CHECK-RV32-NEXT:    vmv.x.s a6, v11
+; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle8.v v8, (a0)
-; CHECK-RV32-NEXT:    csrr a6, vlenb
-; CHECK-RV32-NEXT:    slli a6, a6, 4
-; CHECK-RV32-NEXT:    add a6, sp, a6
-; CHECK-RV32-NEXT:    addi a6, a6, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    csrr a3, vlenb
+; CHECK-RV32-NEXT:    slli a3, a3, 4
+; CHECK-RV32-NEXT:    add a3, sp, a3
+; CHECK-RV32-NEXT:    addi a3, a3, 16
+; CHECK-RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    cpop a1, a1
+; CHECK-RV32-NEXT:    cpop a3, a6
 ; CHECK-RV32-NEXT:    cpop a5, a5
-; CHECK-RV32-NEXT:    add a1, a5, a1
-; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    cpop a4, a4
 ; CHECK-RV32-NEXT:    add a3, a4, a3
-; CHECK-RV32-NEXT:    add a1, a1, a3
+; CHECK-RV32-NEXT:    add a1, a1, a5
+; CHECK-RV32-NEXT:    add a1, a3, a1
 ; CHECK-RV32-NEXT:    add a0, a0, a1
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a1, v7
@@ -269,19 +271,19 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v16, v0
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    viota.m v24, v0
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    li a1, 24
 ; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT:    csrr a0, vlenb
+; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    add a0, sp, a0
+; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    li a1, 24
 ; CHECK-RV32-NEXT:    mul a0, a0, a1
@@ -338,23 +340,27 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vmv1r.v v7, v8
 ; CHECK-RV64-NEXT:    li a2, 128
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-RV64-NEXT:    vle8.v v8, (a1)
-; CHECK-RV64-NEXT:    addi a1, sp, 16
-; CHECK-RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v9, v0, 1
-; CHECK-RV64-NEXT:    vmv.x.s a1, v9
 ; CHECK-RV64-NEXT:    vmv.x.s a3, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-RV64-NEXT:    vle8.v v16, (a1)
+; CHECK-RV64-NEXT:    csrr a1, vlenb
+; CHECK-RV64-NEXT:    slli a1, a1, 3
+; CHECK-RV64-NEXT:    add a1, sp, a1
+; CHECK-RV64-NEXT:    addi a1, a1, 16
+; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.x.s a1, v9
+; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV64-NEXT:    vcpop.m a4, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a4, e8, m8, ta, ma
-; CHECK-RV64-NEXT:    vle8.v v24, (a0)
+; CHECK-RV64-NEXT:    vle8.v v8, (a0)
 ; CHECK-RV64-NEXT:    csrr a4, vlenb
 ; CHECK-RV64-NEXT:    slli a4, a4, 4
 ; CHECK-RV64-NEXT:    add a4, sp, a4
 ; CHECK-RV64-NEXT:    addi a4, a4, 16
-; CHECK-RV64-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV64-NEXT:    vcpop.m a4, v7
 ; CHECK-RV64-NEXT:    cpop a3, a3
@@ -363,25 +369,22 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    add a0, a0, a1
 ; CHECK-RV64-NEXT:    vsetvli zero, a4, e8, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle8.v v8, (a0)
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 3
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
-; CHECK-RV64-NEXT:    viota.m v16, v0
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    viota.m v24, v0
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a1
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV64-NEXT:    csrr a0, vlenb
+; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    add a0, sp, a0
+; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a1
@@ -399,16 +402,15 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 4
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    vrgather.vv v8, v24, v16, v0.t
-; CHECK-RV64-NEXT:    vmv.v.v v16, v8
+; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a1
@@ -430,23 +432,23 @@ define <256 x i8> @test_expandload_v256i8_all_ones(ptr %base, <256 x i8> %passth
 ; CHECK-RV32-LABEL: test_expandload_v256i8_all_ones:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    li a1, 128
+; CHECK-RV32-NEXT:    li a2, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-RV32-NEXT:    vmset.m v8
-; CHECK-RV32-NEXT:    li a2, 32
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v9, v8, a2
-; CHECK-RV32-NEXT:    vmv.x.s a3, v9
-; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    vmv.x.s a4, v8
-; CHECK-RV32-NEXT:    cpop a4, a4
-; CHECK-RV32-NEXT:    add a3, a4, a3
+; CHECK-RV32-NEXT:    vmv.x.s a3, v8
 ; CHECK-RV32-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-RV32-NEXT:    vmv.x.s a4, v9
+; CHECK-RV32-NEXT:    cpop a3, a3
 ; CHECK-RV32-NEXT:    vsrl.vx v9, v8, a2
-; CHECK-RV32-NEXT:    vmv.x.s a2, v9
+; CHECK-RV32-NEXT:    vmv.x.s a2, v8
+; CHECK-RV32-NEXT:    cpop a4, a4
+; CHECK-RV32-NEXT:    add a3, a3, a4
+; CHECK-RV32-NEXT:    vmv.x.s a4, v9
 ; CHECK-RV32-NEXT:    cpop a2, a2
-; CHECK-RV32-NEXT:    vmv.x.s a4, v8
 ; CHECK-RV32-NEXT:    cpop a4, a4
-; CHECK-RV32-NEXT:    add a2, a4, a2
+; CHECK-RV32-NEXT:    add a2, a2, a4
 ; CHECK-RV32-NEXT:    add a3, a0, a3
 ; CHECK-RV32-NEXT:    add a2, a3, a2
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
@@ -462,8 +464,8 @@ define <256 x i8> @test_expandload_v256i8_all_ones(ptr %base, <256 x i8> %passth
 ; CHECK-RV64-NEXT:    vmset.m v16
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vmv.x.s a2, v16
-; CHECK-RV64-NEXT:    cpop a2, a2
 ; CHECK-RV64-NEXT:    vslidedown.vi v16, v16, 1
+; CHECK-RV64-NEXT:    cpop a2, a2
 ; CHECK-RV64-NEXT:    vmv.x.s a3, v16
 ; CHECK-RV64-NEXT:    cpop a3, a3
 ; CHECK-RV64-NEXT:    add a0, a0, a2
@@ -662,107 +664,76 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    li a2, 40
-; CHECK-RV32-NEXT:    mul a1, a1, a2
+; CHECK-RV32-NEXT:    slli a1, a1, 5
 ; CHECK-RV32-NEXT:    sub sp, sp, a1
-; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
 ; CHECK-RV32-NEXT:    li a2, 24
 ; CHECK-RV32-NEXT:    mul a1, a1, a2
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 5
-; CHECK-RV32-NEXT:    add a1, sp, a1
-; CHECK-RV32-NEXT:    addi a1, a1, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    li a1, 64
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a2, v0
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-RV32-NEXT:    vle16.v v8, (a0)
-; CHECK-RV32-NEXT:    csrr a2, vlenb
-; CHECK-RV32-NEXT:    slli a2, a2, 4
-; CHECK-RV32-NEXT:    add a2, sp, a2
-; CHECK-RV32-NEXT:    addi a2, a2, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 8
+; CHECK-RV32-NEXT:    li a2, 32
+; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vmv.x.s a3, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a2, v7
-; CHECK-RV32-NEXT:    li a3, 32
+; CHECK-RV32-NEXT:    vcpop.m a4, v0
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v25, v0, a3
-; CHECK-RV32-NEXT:    vmv.x.s a3, v25
-; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    vmv.x.s a4, v0
+; CHECK-RV32-NEXT:    vsrl.vx v25, v0, a2
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-RV32-NEXT:    vcpop.m a2, v7
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
+; CHECK-RV32-NEXT:    vle16.v v16, (a0)
+; CHECK-RV32-NEXT:    csrr a5, vlenb
+; CHECK-RV32-NEXT:    slli a5, a5, 4
+; CHECK-RV32-NEXT:    add a5, sp, a5
+; CHECK-RV32-NEXT:    addi a5, a5, 16
+; CHECK-RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.x.s a4, v25
 ; CHECK-RV32-NEXT:    cpop a4, a4
-; CHECK-RV32-NEXT:    add a3, a4, a3
+; CHECK-RV32-NEXT:    cpop a3, a3
+; CHECK-RV32-NEXT:    add a3, a3, a4
 ; CHECK-RV32-NEXT:    slli a3, a3, 1
 ; CHECK-RV32-NEXT:    add a0, a0, a3
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-RV32-NEXT:    vle16.v v8, (a0)
+; CHECK-RV32-NEXT:    vle16.v v16, (a0)
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v8, v0
-; CHECK-RV32-NEXT:    addi a0, sp, 16
-; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v16, v0
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 4
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 5
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 5
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v16, v7
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 3
+; CHECK-RV32-NEXT:    li a1, 24
+; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
+; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT:    vmv.v.v v16, v8
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 5
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 40
-; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add sp, sp, a0
 ; CHECK-RV32-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -774,93 +745,69 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV64-NEXT:    addi sp, sp, -16
 ; CHECK-RV64-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    li a2, 40
-; CHECK-RV64-NEXT:    mul a1, a1, a2
+; CHECK-RV64-NEXT:    slli a1, a1, 5
 ; CHECK-RV64-NEXT:    sub sp, sp, a1
-; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    slli a1, a1, 5
+; CHECK-RV64-NEXT:    slli a1, a1, 4
 ; CHECK-RV64-NEXT:    add a1, sp, a1
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    li a1, 64
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a2, v0
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-RV64-NEXT:    vle16.v v16, (a0)
-; CHECK-RV64-NEXT:    csrr a2, vlenb
-; CHECK-RV64-NEXT:    li a3, 24
-; CHECK-RV64-NEXT:    mul a2, a2, a3
-; CHECK-RV64-NEXT:    add a2, sp, a2
-; CHECK-RV64-NEXT:    addi a2, a2, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 8
+; CHECK-RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-RV64-NEXT:    vmv.x.s a2, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a2, v7
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a3, v0
-; CHECK-RV64-NEXT:    cpop a3, a3
-; CHECK-RV64-NEXT:    slli a3, a3, 1
-; CHECK-RV64-NEXT:    add a0, a0, a3
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-RV64-NEXT:    vcpop.m a3, v0
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
+; CHECK-RV64-NEXT:    vle16.v v24, (a0)
+; CHECK-RV64-NEXT:    csrr a3, vlenb
+; CHECK-RV64-NEXT:    li a4, 24
+; CHECK-RV64-NEXT:    mul a3, a3, a4
+; CHECK-RV64-NEXT:    add a3, sp, a3
+; CHECK-RV64-NEXT:    addi a3, a3, 16
+; CHECK-RV64-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-RV64-NEXT:    vcpop.m a3, v7
+; CHECK-RV64-NEXT:    cpop a2, a2
+; CHECK-RV64-NEXT:    slli a2, a2, 1
+; CHECK-RV64-NEXT:    add a0, a0, a2
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle16.v v16, (a0)
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-RV64-NEXT:    viota.m v16, v0
-; CHECK-RV64-NEXT:    addi a0, sp, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v24, v0
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a1
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    addi a0, sp, 16
-; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 3
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v16, v7
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v8, v7
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 5
+; CHECK-RV64-NEXT:    slli a0, a0, 4
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 3
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 40
-; CHECK-RV64-NEXT:    mul a0, a0, a1
+; CHECK-RV64-NEXT:    slli a0, a0, 5
 ; CHECK-RV64-NEXT:    add sp, sp, a0
 ; CHECK-RV64-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -874,17 +821,17 @@ define <128 x i16> @test_expandload_v128i16_all_ones(ptr %base, <128 x i16> %pas
 ; CHECK-RV32-LABEL: test_expandload_v128i16_all_ones:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    li a1, 64
+; CHECK-RV32-NEXT:    li a2, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle16.v v8, (a0)
 ; CHECK-RV32-NEXT:    vmset.m v16
-; CHECK-RV32-NEXT:    li a2, 32
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v17, v16, a2
-; CHECK-RV32-NEXT:    vmv.x.s a2, v17
-; CHECK-RV32-NEXT:    cpop a2, a2
-; CHECK-RV32-NEXT:    vmv.x.s a3, v16
+; CHECK-RV32-NEXT:    vmv.x.s a2, v16
+; CHECK-RV32-NEXT:    vmv.x.s a3, v17
 ; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    add a2, a3, a2
+; CHECK-RV32-NEXT:    cpop a2, a2
+; CHECK-RV32-NEXT:    add a2, a2, a3
 ; CHECK-RV32-NEXT:    slli a2, a2, 1
 ; CHECK-RV32-NEXT:    add a0, a0, a2
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -1069,92 +1016,69 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    li a2, 40
-; CHECK-RV32-NEXT:    mul a1, a1, a2
+; CHECK-RV32-NEXT:    slli a1, a1, 5
 ; CHECK-RV32-NEXT:    sub sp, sp, a1
-; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 5
+; CHECK-RV32-NEXT:    slli a1, a1, 4
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    li a1, 32
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a2, v0
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-RV32-NEXT:    vle32.v v16, (a0)
-; CHECK-RV32-NEXT:    csrr a2, vlenb
-; CHECK-RV32-NEXT:    li a3, 24
-; CHECK-RV32-NEXT:    mul a2, a2, a3
-; CHECK-RV32-NEXT:    add a2, sp, a2
-; CHECK-RV32-NEXT:    addi a2, a2, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 4
-; CHECK-RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-RV32-NEXT:    vcpop.m a2, v7
-; CHECK-RV32-NEXT:    vmv.x.s a3, v0
-; CHECK-RV32-NEXT:    cpop a3, a3
-; CHECK-RV32-NEXT:    slli a3, a3, 2
-; CHECK-RV32-NEXT:    add a0, a0, a3
-; CHECK-RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-RV32-NEXT:    vmv.x.s a2, v0
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32-NEXT:    vcpop.m a3, v0
+; CHECK-RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-RV32-NEXT:    vle32.v v24, (a0)
+; CHECK-RV32-NEXT:    csrr a3, vlenb
+; CHECK-RV32-NEXT:    li a4, 24
+; CHECK-RV32-NEXT:    mul a3, a3, a4
+; CHECK-RV32-NEXT:    add a3, sp, a3
+; CHECK-RV32-NEXT:    addi a3, a3, 16
+; CHECK-RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV32-NEXT:    vcpop.m a3, v7
+; CHECK-RV32-NEXT:    cpop a2, a2
+; CHECK-RV32-NEXT:    slli a2, a2, 2
+; CHECK-RV32-NEXT:    add a0, a0, a2
+; CHECK-RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle32.v v16, (a0)
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v16, v0
-; CHECK-RV32-NEXT:    addi a0, sp, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v24, v0
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    li a1, 24
 ; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    addi a0, sp, 16
-; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 3
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v16, v7
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 5
+; CHECK-RV32-NEXT:    slli a0, a0, 4
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 3
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 40
-; CHECK-RV32-NEXT:    mul a0, a0, a1
+; CHECK-RV32-NEXT:    slli a0, a0, 5
 ; CHECK-RV32-NEXT:    add sp, sp, a0
 ; CHECK-RV32-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -1166,92 +1090,69 @@ define <64 x i32> @test_expandload_v64i32(ptr %base, <64 x i1> %mask, <64 x i32>
 ; CHECK-RV64-NEXT:    addi sp, sp, -16
 ; CHECK-RV64-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    li a2, 40
-; CHECK-RV64-NEXT:    mul a1, a1, a2
+; CHECK-RV64-NEXT:    slli a1, a1, 5
 ; CHECK-RV64-NEXT:    sub sp, sp, a1
-; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    slli a1, a1, 5
+; CHECK-RV64-NEXT:    slli a1, a1, 4
 ; CHECK-RV64-NEXT:    add a1, sp, a1
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    li a1, 32
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a2, v0
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-RV64-NEXT:    vle32.v v16, (a0)
-; CHECK-RV64-NEXT:    csrr a2, vlenb
-; CHECK-RV64-NEXT:    li a3, 24
-; CHECK-RV64-NEXT:    mul a2, a2, a3
-; CHECK-RV64-NEXT:    add a2, sp, a2
-; CHECK-RV64-NEXT:    addi a2, a2, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 4
-; CHECK-RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-RV64-NEXT:    vcpop.m a2, v7
-; CHECK-RV64-NEXT:    vmv.x.s a3, v0
-; CHECK-RV64-NEXT:    cpopw a3, a3
-; CHECK-RV64-NEXT:    slli a3, a3, 2
-; CHECK-RV64-NEXT:    add a0, a0, a3
-; CHECK-RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-RV64-NEXT:    vmv.x.s a2, v0
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV64-NEXT:    vcpop.m a3, v0
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-RV64-NEXT:    vle32.v v24, (a0)
+; CHECK-RV64-NEXT:    csrr a3, vlenb
+; CHECK-RV64-NEXT:    li a4, 24
+; CHECK-RV64-NEXT:    mul a3, a3, a4
+; CHECK-RV64-NEXT:    add a3, sp, a3
+; CHECK-RV64-NEXT:    addi a3, a3, 16
+; CHECK-RV64-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-RV64-NEXT:    vcpop.m a3, v7
+; CHECK-RV64-NEXT:    cpopw a2, a2
+; CHECK-RV64-NEXT:    slli a2, a2, 2
+; CHECK-RV64-NEXT:    add a0, a0, a2
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle32.v v16, (a0)
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
-; CHECK-RV64-NEXT:    viota.m v16, v0
-; CHECK-RV64-NEXT:    addi a0, sp, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v24, v0
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a1
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    addi a0, sp, 16
-; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 3
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v16, v7
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v8, v7
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 5
+; CHECK-RV64-NEXT:    slli a0, a0, 4
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 3
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 40
-; CHECK-RV64-NEXT:    mul a0, a0, a1
+; CHECK-RV64-NEXT:    slli a0, a0, 5
 ; CHECK-RV64-NEXT:    add sp, sp, a0
 ; CHECK-RV64-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -1420,92 +1321,68 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    li a2, 40
-; CHECK-RV32-NEXT:    mul a1, a1, a2
+; CHECK-RV32-NEXT:    slli a1, a1, 5
 ; CHECK-RV32-NEXT:    sub sp, sp, a1
-; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 5
+; CHECK-RV32-NEXT:    slli a1, a1, 4
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a1, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-RV32-NEXT:    vle64.v v16, (a0)
+; CHECK-RV32-NEXT:    vle64.v v24, (a0)
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
 ; CHECK-RV32-NEXT:    li a2, 24
 ; CHECK-RV32-NEXT:    mul a1, a1, a2
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vmv.x.s a1, v0
+; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-RV32-NEXT:    zext.h a1, a1
 ; CHECK-RV32-NEXT:    cpop a1, a1
 ; CHECK-RV32-NEXT:    slli a1, a1, 3
 ; CHECK-RV32-NEXT:    add a0, a0, a1
-; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a1, v7
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle64.v v16, (a0)
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v16, v0
-; CHECK-RV32-NEXT:    addi a0, sp, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v24, v0
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    li a1, 24
 ; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    addi a0, sp, 16
-; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 3
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v16, v7
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 5
+; CHECK-RV32-NEXT:    slli a0, a0, 4
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
+; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 3
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
+; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 40
-; CHECK-RV32-NEXT:    mul a0, a0, a1
+; CHECK-RV32-NEXT:    slli a0, a0, 5
 ; CHECK-RV32-NEXT:    add sp, sp, a0
 ; CHECK-RV32-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
@@ -1517,92 +1394,68 @@ define <32 x i64> @test_expandload_v32i64(ptr %base, <32 x i1> %mask, <32 x i64>
 ; CHECK-RV64-NEXT:    addi sp, sp, -16
 ; CHECK-RV64-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    li a2, 40
-; CHECK-RV64-NEXT:    mul a1, a1, a2
+; CHECK-RV64-NEXT:    slli a1, a1, 5
 ; CHECK-RV64-NEXT:    sub sp, sp, a1
-; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
-; CHECK-RV64-NEXT:    slli a1, a1, 5
+; CHECK-RV64-NEXT:    slli a1, a1, 4
 ; CHECK-RV64-NEXT:    add a1, sp, a1
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-RV64-NEXT:    vcpop.m a1, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-RV64-NEXT:    vle64.v v16, (a0)
+; CHECK-RV64-NEXT:    vle64.v v24, (a0)
 ; CHECK-RV64-NEXT:    csrr a1, vlenb
 ; CHECK-RV64-NEXT:    li a2, 24
 ; CHECK-RV64-NEXT:    mul a1, a1, a2
 ; CHECK-RV64-NEXT:    add a1, sp, a1
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vmv.x.s a1, v0
+; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-RV64-NEXT:    zext.h a1, a1
 ; CHECK-RV64-NEXT:    cpopw a1, a1
 ; CHECK-RV64-NEXT:    slli a1, a1, 3
 ; CHECK-RV64-NEXT:    add a0, a0, a1
-; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-RV64-NEXT:    vcpop.m a1, v7
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle64.v v16, (a0)
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
-; CHECK-RV64-NEXT:    viota.m v16, v0
-; CHECK-RV64-NEXT:    addi a0, sp, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v24, v0
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a1
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    addi a0, sp, 16
-; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 3
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v16, v7
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v8, v7
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 5
+; CHECK-RV64-NEXT:    slli a0, a0, 4
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    slli a0, a0, 3
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 3
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
+; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 40
-; CHECK-RV64-NEXT:    mul a0, a0, a1
+; CHECK-RV64-NEXT:    slli a0, a0, 5
 ; CHECK-RV64-NEXT:    add sp, sp, a0
 ; CHECK-RV64-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV64-NEXT:    addi sp, sp, 16
@@ -1765,26 +1618,26 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a1, .LBB61_30
 ; CHECK-RV32-NEXT:  .LBB61_29: # %cond.load109
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 29, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 28
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 28
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_30: # %else110
 ; CHECK-RV32-NEXT:    slli a2, a3, 2
 ; CHECK-RV32-NEXT:    li a1, 32
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_32
 ; CHECK-RV32-NEXT:  # %bb.31: # %cond.load113
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 30, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a2
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 29
+; CHECK-RV32-NEXT:    vmv.s.x v9, a2
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 29
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_32: # %else114
 ; CHECK-RV32-NEXT:    slli a2, a3, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -1792,10 +1645,10 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_34
 ; CHECK-RV32-NEXT:  # %bb.33: # %cond.load117
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v17, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a2
 ; CHECK-RV32-NEXT:    vsetivli zero, 31, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vi v8, v17, 30
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 30
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -1926,13 +1779,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_65: # %cond.load241
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 62
 ; CHECK-RV32-NEXT:    li a4, 61
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -1943,12 +1796,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_68
 ; CHECK-RV32-NEXT:  # %bb.67: # %cond.load245
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v17, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 63
 ; CHECK-RV32-NEXT:    li a4, 62
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v17, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2079,13 +1932,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_99: # %cond.load369
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 94
 ; CHECK-RV32-NEXT:    li a4, 93
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2096,12 +1949,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_102
 ; CHECK-RV32-NEXT:  # %bb.101: # %cond.load373
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 95
 ; CHECK-RV32-NEXT:    li a4, 94
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2232,13 +2085,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_133: # %cond.load497
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 126
 ; CHECK-RV32-NEXT:    li a4, 125
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -2249,12 +2102,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_136
 ; CHECK-RV32-NEXT:  # %bb.135: # %cond.load501
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v18, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 127
 ; CHECK-RV32-NEXT:    li a4, 126
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2385,13 +2238,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_167: # %cond.load625
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 158
 ; CHECK-RV32-NEXT:    li a4, 157
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2402,12 +2255,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_170
 ; CHECK-RV32-NEXT:  # %bb.169: # %cond.load629
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 159
 ; CHECK-RV32-NEXT:    li a4, 158
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2538,16 +2391,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_201: # %cond.load753
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 190
 ; CHECK-RV32-NEXT:    li a4, 189
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_202: # %else754
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2555,12 +2408,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_204
 ; CHECK-RV32-NEXT:  # %bb.203: # %cond.load757
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 191
 ; CHECK-RV32-NEXT:    li a4, 190
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2691,13 +2544,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_235: # %cond.load881
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 222
 ; CHECK-RV32-NEXT:    li a4, 221
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2708,12 +2561,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a2, .LBB61_238
 ; CHECK-RV32-NEXT:  # %bb.237: # %cond.load885
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 223
 ; CHECK-RV32-NEXT:    li a4, 222
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -2844,16 +2697,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_269: # %cond.load1009
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 254
 ; CHECK-RV32-NEXT:    li a4, 253
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:  .LBB61_270: # %else1010
 ; CHECK-RV32-NEXT:    slli a3, a2, 1
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -2861,12 +2714,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_272
 ; CHECK-RV32-NEXT:  # %bb.271: # %cond.load1013
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 255
 ; CHECK-RV32-NEXT:    li a4, 254
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4046,326 +3899,326 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_2
 ; CHECK-RV32-NEXT:  .LBB61_545: # %cond.load1
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV32-NEXT:    vsetivli zero, 2, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 4
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_546
 ; CHECK-RV32-NEXT:    j .LBB61_3
 ; CHECK-RV32-NEXT:  .LBB61_546: # %cond.load5
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 2
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 8
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_547
 ; CHECK-RV32-NEXT:    j .LBB61_4
 ; CHECK-RV32-NEXT:  .LBB61_547: # %cond.load9
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 3
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 3
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 16
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_548
 ; CHECK-RV32-NEXT:    j .LBB61_5
 ; CHECK-RV32-NEXT:  .LBB61_548: # %cond.load13
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 5, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 4
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 32
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_549
 ; CHECK-RV32-NEXT:    j .LBB61_6
 ; CHECK-RV32-NEXT:  .LBB61_549: # %cond.load17
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 5
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 5
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 64
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_550
 ; CHECK-RV32-NEXT:    j .LBB61_7
 ; CHECK-RV32-NEXT:  .LBB61_550: # %cond.load21
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 7, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 6
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 6
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 128
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_551
 ; CHECK-RV32-NEXT:    j .LBB61_8
 ; CHECK-RV32-NEXT:  .LBB61_551: # %cond.load25
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 7
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 7
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 256
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_552
 ; CHECK-RV32-NEXT:    j .LBB61_9
 ; CHECK-RV32-NEXT:  .LBB61_552: # %cond.load29
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 9, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 8
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 512
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_553
 ; CHECK-RV32-NEXT:    j .LBB61_10
 ; CHECK-RV32-NEXT:  .LBB61_553: # %cond.load33
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 9
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 9
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a1, a3, 1024
 ; CHECK-RV32-NEXT:    bnez a1, .LBB61_554
 ; CHECK-RV32-NEXT:    j .LBB61_11
 ; CHECK-RV32-NEXT:  .LBB61_554: # %cond.load37
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 11, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 10
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 10
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 20
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_555
 ; CHECK-RV32-NEXT:    j .LBB61_12
 ; CHECK-RV32-NEXT:  .LBB61_555: # %cond.load41
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 11
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 11
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 19
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_556
 ; CHECK-RV32-NEXT:    j .LBB61_13
 ; CHECK-RV32-NEXT:  .LBB61_556: # %cond.load45
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 13, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 12
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 12
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 18
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_557
 ; CHECK-RV32-NEXT:    j .LBB61_14
 ; CHECK-RV32-NEXT:  .LBB61_557: # %cond.load49
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 13
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 13
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 17
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_558
 ; CHECK-RV32-NEXT:    j .LBB61_15
 ; CHECK-RV32-NEXT:  .LBB61_558: # %cond.load53
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 15, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 14
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 14
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 16
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_559
 ; CHECK-RV32-NEXT:    j .LBB61_16
 ; CHECK-RV32-NEXT:  .LBB61_559: # %cond.load57
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 15
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 15
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 15
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_560
 ; CHECK-RV32-NEXT:    j .LBB61_17
 ; CHECK-RV32-NEXT:  .LBB61_560: # %cond.load61
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 17, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 16
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 16
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 14
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_561
 ; CHECK-RV32-NEXT:    j .LBB61_18
 ; CHECK-RV32-NEXT:  .LBB61_561: # %cond.load65
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 18, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 17
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 17
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 13
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_562
 ; CHECK-RV32-NEXT:    j .LBB61_19
 ; CHECK-RV32-NEXT:  .LBB61_562: # %cond.load69
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 19, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 18
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 18
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 12
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_563
 ; CHECK-RV32-NEXT:    j .LBB61_20
 ; CHECK-RV32-NEXT:  .LBB61_563: # %cond.load73
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 20, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 19
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 19
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 11
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_564
 ; CHECK-RV32-NEXT:    j .LBB61_21
 ; CHECK-RV32-NEXT:  .LBB61_564: # %cond.load77
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 21, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 20
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 20
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 10
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_565
 ; CHECK-RV32-NEXT:    j .LBB61_22
 ; CHECK-RV32-NEXT:  .LBB61_565: # %cond.load81
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 22, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 21
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 21
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 9
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_566
 ; CHECK-RV32-NEXT:    j .LBB61_23
 ; CHECK-RV32-NEXT:  .LBB61_566: # %cond.load85
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 23, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 22
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 22
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 8
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_567
 ; CHECK-RV32-NEXT:    j .LBB61_24
 ; CHECK-RV32-NEXT:  .LBB61_567: # %cond.load89
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 24, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 23
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 23
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 7
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_568
 ; CHECK-RV32-NEXT:    j .LBB61_25
 ; CHECK-RV32-NEXT:  .LBB61_568: # %cond.load93
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 25, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 24
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 24
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 6
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_569
 ; CHECK-RV32-NEXT:    j .LBB61_26
 ; CHECK-RV32-NEXT:  .LBB61_569: # %cond.load97
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 26, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 25
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 25
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 5
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_570
 ; CHECK-RV32-NEXT:    j .LBB61_27
 ; CHECK-RV32-NEXT:  .LBB61_570: # %cond.load101
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 27, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 26
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 26
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 4
 ; CHECK-RV32-NEXT:    bltz a1, .LBB61_571
 ; CHECK-RV32-NEXT:    j .LBB61_28
 ; CHECK-RV32-NEXT:  .LBB61_571: # %cond.load105
 ; CHECK-RV32-NEXT:    lbu a1, 0(a0)
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetivli zero, 28, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a1
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
-; CHECK-RV32-NEXT:    vslideup.vi v8, v16, 27
+; CHECK-RV32-NEXT:    vmv.s.x v9, a1
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 27
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv1r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv1r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a1, a3, 3
 ; CHECK-RV32-NEXT:    bgez a1, .LBB61_1025
 ; CHECK-RV32-NEXT:    j .LBB61_29
@@ -4373,11 +4226,11 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_30
 ; CHECK-RV32-NEXT:  .LBB61_572: # %cond.load121
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vi v8, v24, 31
+; CHECK-RV32-NEXT:    vslideup.vi v8, v9, 31
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4387,13 +4240,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_573: # %cond.load125
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 33
 ; CHECK-RV32-NEXT:    li a4, 32
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4403,13 +4256,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_574: # %cond.load129
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 34
 ; CHECK-RV32-NEXT:    li a4, 33
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4419,13 +4272,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_575: # %cond.load133
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 35
 ; CHECK-RV32-NEXT:    li a4, 34
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4435,13 +4288,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_576: # %cond.load137
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 36
 ; CHECK-RV32-NEXT:    li a4, 35
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4451,13 +4304,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_577: # %cond.load141
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 37
 ; CHECK-RV32-NEXT:    li a4, 36
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4467,13 +4320,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_578: # %cond.load145
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 38
 ; CHECK-RV32-NEXT:    li a4, 37
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4483,13 +4336,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_579: # %cond.load149
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 39
 ; CHECK-RV32-NEXT:    li a4, 38
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4499,13 +4352,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_580: # %cond.load153
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 40
 ; CHECK-RV32-NEXT:    li a4, 39
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4515,13 +4368,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_581: # %cond.load157
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 41
 ; CHECK-RV32-NEXT:    li a4, 40
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4531,13 +4384,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_582: # %cond.load161
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 42
 ; CHECK-RV32-NEXT:    li a4, 41
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4547,13 +4400,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_583: # %cond.load165
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 43
 ; CHECK-RV32-NEXT:    li a4, 42
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4563,13 +4416,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_584: # %cond.load169
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 44
 ; CHECK-RV32-NEXT:    li a4, 43
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4579,13 +4432,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_585: # %cond.load173
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 45
 ; CHECK-RV32-NEXT:    li a4, 44
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4595,13 +4448,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_586: # %cond.load177
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 46
 ; CHECK-RV32-NEXT:    li a4, 45
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4611,13 +4464,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_587: # %cond.load181
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 47
 ; CHECK-RV32-NEXT:    li a4, 46
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4627,13 +4480,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_588: # %cond.load185
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 48
 ; CHECK-RV32-NEXT:    li a4, 47
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4643,13 +4496,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_589: # %cond.load189
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 49
 ; CHECK-RV32-NEXT:    li a4, 48
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4659,13 +4512,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_590: # %cond.load193
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 50
 ; CHECK-RV32-NEXT:    li a4, 49
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4675,13 +4528,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_591: # %cond.load197
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 51
 ; CHECK-RV32-NEXT:    li a4, 50
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4691,13 +4544,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_592: # %cond.load201
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 52
 ; CHECK-RV32-NEXT:    li a4, 51
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4707,13 +4560,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_593: # %cond.load205
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 53
 ; CHECK-RV32-NEXT:    li a4, 52
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4723,13 +4576,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_594: # %cond.load209
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 54
 ; CHECK-RV32-NEXT:    li a4, 53
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4739,13 +4592,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_595: # %cond.load213
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 55
 ; CHECK-RV32-NEXT:    li a4, 54
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4755,13 +4608,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_596: # %cond.load217
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 56
 ; CHECK-RV32-NEXT:    li a4, 55
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4771,13 +4624,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_597: # %cond.load221
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 57
 ; CHECK-RV32-NEXT:    li a4, 56
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4787,13 +4640,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_598: # %cond.load225
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 58
 ; CHECK-RV32-NEXT:    li a4, 57
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4803,13 +4656,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_599: # %cond.load229
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 59
 ; CHECK-RV32-NEXT:    li a4, 58
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4819,13 +4672,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_600: # %cond.load233
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 60
 ; CHECK-RV32-NEXT:    li a4, 59
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4835,13 +4688,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_601: # %cond.load237
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v9, a3
 ; CHECK-RV32-NEXT:    li a3, 61
 ; CHECK-RV32-NEXT:    li a4, 60
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -4852,12 +4705,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_66
 ; CHECK-RV32-NEXT:  .LBB61_602: # %cond.load249
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v17, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v9, a2
 ; CHECK-RV32-NEXT:    li a2, 64
 ; CHECK-RV32-NEXT:    li a4, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m1, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v17, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v9, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv1r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4867,13 +4720,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_603: # %cond.load253
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 65
 ; CHECK-RV32-NEXT:    li a4, 64
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4883,13 +4736,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_604: # %cond.load257
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 66
 ; CHECK-RV32-NEXT:    li a4, 65
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4899,13 +4752,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_605: # %cond.load261
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 67
 ; CHECK-RV32-NEXT:    li a4, 66
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4915,13 +4768,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_606: # %cond.load265
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 68
 ; CHECK-RV32-NEXT:    li a4, 67
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4931,13 +4784,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_607: # %cond.load269
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 69
 ; CHECK-RV32-NEXT:    li a4, 68
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4947,13 +4800,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_608: # %cond.load273
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 70
 ; CHECK-RV32-NEXT:    li a4, 69
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4963,13 +4816,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_609: # %cond.load277
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 71
 ; CHECK-RV32-NEXT:    li a4, 70
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4979,13 +4832,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_610: # %cond.load281
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 72
 ; CHECK-RV32-NEXT:    li a4, 71
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -4995,13 +4848,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_611: # %cond.load285
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 73
 ; CHECK-RV32-NEXT:    li a4, 72
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5011,13 +4864,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_612: # %cond.load289
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 74
 ; CHECK-RV32-NEXT:    li a4, 73
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5027,13 +4880,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_613: # %cond.load293
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 75
 ; CHECK-RV32-NEXT:    li a4, 74
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5043,13 +4896,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_614: # %cond.load297
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 76
 ; CHECK-RV32-NEXT:    li a4, 75
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5059,13 +4912,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_615: # %cond.load301
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 77
 ; CHECK-RV32-NEXT:    li a4, 76
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5075,13 +4928,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_616: # %cond.load305
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 78
 ; CHECK-RV32-NEXT:    li a4, 77
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5091,13 +4944,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_617: # %cond.load309
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 79
 ; CHECK-RV32-NEXT:    li a4, 78
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5107,13 +4960,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_618: # %cond.load313
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 80
 ; CHECK-RV32-NEXT:    li a4, 79
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5123,13 +4976,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_619: # %cond.load317
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 81
 ; CHECK-RV32-NEXT:    li a4, 80
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5139,13 +4992,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_620: # %cond.load321
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 82
 ; CHECK-RV32-NEXT:    li a4, 81
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5155,13 +5008,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_621: # %cond.load325
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 83
 ; CHECK-RV32-NEXT:    li a4, 82
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5171,13 +5024,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_622: # %cond.load329
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 84
 ; CHECK-RV32-NEXT:    li a4, 83
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5187,13 +5040,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_623: # %cond.load333
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 85
 ; CHECK-RV32-NEXT:    li a4, 84
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5203,13 +5056,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_624: # %cond.load337
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 86
 ; CHECK-RV32-NEXT:    li a4, 85
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5219,13 +5072,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_625: # %cond.load341
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 87
 ; CHECK-RV32-NEXT:    li a4, 86
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5235,13 +5088,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_626: # %cond.load345
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 88
 ; CHECK-RV32-NEXT:    li a4, 87
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5251,13 +5104,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_627: # %cond.load349
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 89
 ; CHECK-RV32-NEXT:    li a4, 88
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5267,13 +5120,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_628: # %cond.load353
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 90
 ; CHECK-RV32-NEXT:    li a4, 89
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5283,13 +5136,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_629: # %cond.load357
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 91
 ; CHECK-RV32-NEXT:    li a4, 90
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5299,13 +5152,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_630: # %cond.load361
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 92
 ; CHECK-RV32-NEXT:    li a4, 91
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5315,13 +5168,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_631: # %cond.load365
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 93
 ; CHECK-RV32-NEXT:    li a4, 92
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5332,12 +5185,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_100
 ; CHECK-RV32-NEXT:  .LBB61_632: # %cond.load377
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 96
 ; CHECK-RV32-NEXT:    li a4, 95
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5347,13 +5200,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_633: # %cond.load381
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 97
 ; CHECK-RV32-NEXT:    li a4, 96
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5363,13 +5216,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_634: # %cond.load385
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 98
 ; CHECK-RV32-NEXT:    li a4, 97
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5379,13 +5232,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_635: # %cond.load389
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 99
 ; CHECK-RV32-NEXT:    li a4, 98
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5395,13 +5248,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_636: # %cond.load393
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 100
 ; CHECK-RV32-NEXT:    li a4, 99
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5411,13 +5264,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_637: # %cond.load397
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 101
 ; CHECK-RV32-NEXT:    li a4, 100
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5427,13 +5280,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_638: # %cond.load401
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 102
 ; CHECK-RV32-NEXT:    li a4, 101
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5443,13 +5296,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_639: # %cond.load405
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 103
 ; CHECK-RV32-NEXT:    li a4, 102
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5459,13 +5312,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_640: # %cond.load409
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 104
 ; CHECK-RV32-NEXT:    li a4, 103
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5475,13 +5328,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_641: # %cond.load413
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 105
 ; CHECK-RV32-NEXT:    li a4, 104
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5491,13 +5344,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_642: # %cond.load417
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 106
 ; CHECK-RV32-NEXT:    li a4, 105
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5507,13 +5360,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_643: # %cond.load421
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 107
 ; CHECK-RV32-NEXT:    li a4, 106
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5523,13 +5376,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_644: # %cond.load425
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 108
 ; CHECK-RV32-NEXT:    li a4, 107
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5539,13 +5392,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_645: # %cond.load429
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 109
 ; CHECK-RV32-NEXT:    li a4, 108
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5555,13 +5408,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_646: # %cond.load433
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 110
 ; CHECK-RV32-NEXT:    li a4, 109
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5571,13 +5424,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_647: # %cond.load437
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 111
 ; CHECK-RV32-NEXT:    li a4, 110
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5587,13 +5440,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_648: # %cond.load441
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 112
 ; CHECK-RV32-NEXT:    li a4, 111
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5603,13 +5456,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_649: # %cond.load445
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 113
 ; CHECK-RV32-NEXT:    li a4, 112
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5619,13 +5472,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_650: # %cond.load449
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 114
 ; CHECK-RV32-NEXT:    li a4, 113
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5635,13 +5488,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_651: # %cond.load453
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 115
 ; CHECK-RV32-NEXT:    li a4, 114
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5651,13 +5504,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_652: # %cond.load457
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 116
 ; CHECK-RV32-NEXT:    li a4, 115
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5667,13 +5520,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_653: # %cond.load461
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 117
 ; CHECK-RV32-NEXT:    li a4, 116
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5683,13 +5536,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_654: # %cond.load465
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 118
 ; CHECK-RV32-NEXT:    li a4, 117
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5699,13 +5552,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_655: # %cond.load469
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 119
 ; CHECK-RV32-NEXT:    li a4, 118
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5715,13 +5568,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_656: # %cond.load473
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 120
 ; CHECK-RV32-NEXT:    li a4, 119
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5731,13 +5584,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_657: # %cond.load477
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 121
 ; CHECK-RV32-NEXT:    li a4, 120
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5747,13 +5600,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_658: # %cond.load481
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 122
 ; CHECK-RV32-NEXT:    li a4, 121
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5763,13 +5616,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_659: # %cond.load485
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 123
 ; CHECK-RV32-NEXT:    li a4, 122
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5779,13 +5632,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_660: # %cond.load489
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 124
 ; CHECK-RV32-NEXT:    li a4, 123
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5795,13 +5648,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_661: # %cond.load493
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v24, a3
 ; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v10, a3
 ; CHECK-RV32-NEXT:    li a3, 125
 ; CHECK-RV32-NEXT:    li a4, 124
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v24, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v16
@@ -5812,12 +5665,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_134
 ; CHECK-RV32-NEXT:  .LBB61_662: # %cond.load505
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV32-NEXT:    li a2, 128
 ; CHECK-RV32-NEXT:    li a4, 127
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v18, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v10, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5827,13 +5680,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_663: # %cond.load509
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 129
 ; CHECK-RV32-NEXT:    li a4, 128
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5843,13 +5696,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_664: # %cond.load513
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 130
 ; CHECK-RV32-NEXT:    li a4, 129
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5859,13 +5712,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_665: # %cond.load517
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 131
 ; CHECK-RV32-NEXT:    li a4, 130
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5875,13 +5728,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_666: # %cond.load521
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 132
 ; CHECK-RV32-NEXT:    li a4, 131
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5891,13 +5744,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_667: # %cond.load525
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 133
 ; CHECK-RV32-NEXT:    li a4, 132
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5907,13 +5760,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_668: # %cond.load529
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 134
 ; CHECK-RV32-NEXT:    li a4, 133
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5923,13 +5776,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_669: # %cond.load533
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 135
 ; CHECK-RV32-NEXT:    li a4, 134
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5939,13 +5792,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_670: # %cond.load537
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 136
 ; CHECK-RV32-NEXT:    li a4, 135
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5955,13 +5808,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_671: # %cond.load541
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 137
 ; CHECK-RV32-NEXT:    li a4, 136
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5971,13 +5824,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_672: # %cond.load545
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 138
 ; CHECK-RV32-NEXT:    li a4, 137
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -5987,13 +5840,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_673: # %cond.load549
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 139
 ; CHECK-RV32-NEXT:    li a4, 138
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6003,13 +5856,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_674: # %cond.load553
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 140
 ; CHECK-RV32-NEXT:    li a4, 139
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6019,13 +5872,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_675: # %cond.load557
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 141
 ; CHECK-RV32-NEXT:    li a4, 140
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6035,13 +5888,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_676: # %cond.load561
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 142
 ; CHECK-RV32-NEXT:    li a4, 141
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6051,13 +5904,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_677: # %cond.load565
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 143
 ; CHECK-RV32-NEXT:    li a4, 142
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6067,13 +5920,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_678: # %cond.load569
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 144
 ; CHECK-RV32-NEXT:    li a4, 143
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6083,13 +5936,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_679: # %cond.load573
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 145
 ; CHECK-RV32-NEXT:    li a4, 144
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6099,13 +5952,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_680: # %cond.load577
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 146
 ; CHECK-RV32-NEXT:    li a4, 145
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6115,13 +5968,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_681: # %cond.load581
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 147
 ; CHECK-RV32-NEXT:    li a4, 146
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6131,13 +5984,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_682: # %cond.load585
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 148
 ; CHECK-RV32-NEXT:    li a4, 147
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6147,13 +6000,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_683: # %cond.load589
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 149
 ; CHECK-RV32-NEXT:    li a4, 148
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6163,13 +6016,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_684: # %cond.load593
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 150
 ; CHECK-RV32-NEXT:    li a4, 149
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6179,13 +6032,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_685: # %cond.load597
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 151
 ; CHECK-RV32-NEXT:    li a4, 150
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6195,13 +6048,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_686: # %cond.load601
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 152
 ; CHECK-RV32-NEXT:    li a4, 151
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6211,13 +6064,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_687: # %cond.load605
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 153
 ; CHECK-RV32-NEXT:    li a4, 152
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6227,13 +6080,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_688: # %cond.load609
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 154
 ; CHECK-RV32-NEXT:    li a4, 153
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6243,13 +6096,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_689: # %cond.load613
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 155
 ; CHECK-RV32-NEXT:    li a4, 154
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6259,13 +6112,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_690: # %cond.load617
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 156
 ; CHECK-RV32-NEXT:    li a4, 155
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6275,13 +6128,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_691: # %cond.load621
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 157
 ; CHECK-RV32-NEXT:    li a4, 156
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6292,479 +6145,479 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_168
 ; CHECK-RV32-NEXT:  .LBB61_692: # %cond.load633
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 160
 ; CHECK-RV32-NEXT:    li a4, 159
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_693
 ; CHECK-RV32-NEXT:    j .LBB61_172
 ; CHECK-RV32-NEXT:  .LBB61_693: # %cond.load637
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 161
 ; CHECK-RV32-NEXT:    li a4, 160
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 2
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_694
 ; CHECK-RV32-NEXT:    j .LBB61_173
 ; CHECK-RV32-NEXT:  .LBB61_694: # %cond.load641
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 162
 ; CHECK-RV32-NEXT:    li a4, 161
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 4
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_695
 ; CHECK-RV32-NEXT:    j .LBB61_174
 ; CHECK-RV32-NEXT:  .LBB61_695: # %cond.load645
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 163
 ; CHECK-RV32-NEXT:    li a4, 162
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 8
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_696
 ; CHECK-RV32-NEXT:    j .LBB61_175
 ; CHECK-RV32-NEXT:  .LBB61_696: # %cond.load649
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 164
 ; CHECK-RV32-NEXT:    li a4, 163
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 16
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_697
 ; CHECK-RV32-NEXT:    j .LBB61_176
 ; CHECK-RV32-NEXT:  .LBB61_697: # %cond.load653
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 165
 ; CHECK-RV32-NEXT:    li a4, 164
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 32
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_698
 ; CHECK-RV32-NEXT:    j .LBB61_177
 ; CHECK-RV32-NEXT:  .LBB61_698: # %cond.load657
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 166
 ; CHECK-RV32-NEXT:    li a4, 165
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 64
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_699
 ; CHECK-RV32-NEXT:    j .LBB61_178
 ; CHECK-RV32-NEXT:  .LBB61_699: # %cond.load661
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 167
 ; CHECK-RV32-NEXT:    li a4, 166
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 128
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_700
 ; CHECK-RV32-NEXT:    j .LBB61_179
 ; CHECK-RV32-NEXT:  .LBB61_700: # %cond.load665
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 168
 ; CHECK-RV32-NEXT:    li a4, 167
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 256
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_701
 ; CHECK-RV32-NEXT:    j .LBB61_180
 ; CHECK-RV32-NEXT:  .LBB61_701: # %cond.load669
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 169
 ; CHECK-RV32-NEXT:    li a4, 168
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 512
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_702
 ; CHECK-RV32-NEXT:    j .LBB61_181
 ; CHECK-RV32-NEXT:  .LBB61_702: # %cond.load673
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 170
 ; CHECK-RV32-NEXT:    li a4, 169
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1024
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_703
 ; CHECK-RV32-NEXT:    j .LBB61_182
 ; CHECK-RV32-NEXT:  .LBB61_703: # %cond.load677
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 171
 ; CHECK-RV32-NEXT:    li a4, 170
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 20
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_704
 ; CHECK-RV32-NEXT:    j .LBB61_183
 ; CHECK-RV32-NEXT:  .LBB61_704: # %cond.load681
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 172
 ; CHECK-RV32-NEXT:    li a4, 171
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 19
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_705
 ; CHECK-RV32-NEXT:    j .LBB61_184
 ; CHECK-RV32-NEXT:  .LBB61_705: # %cond.load685
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 173
 ; CHECK-RV32-NEXT:    li a4, 172
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 18
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_706
 ; CHECK-RV32-NEXT:    j .LBB61_185
 ; CHECK-RV32-NEXT:  .LBB61_706: # %cond.load689
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 174
 ; CHECK-RV32-NEXT:    li a4, 173
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 17
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_707
 ; CHECK-RV32-NEXT:    j .LBB61_186
 ; CHECK-RV32-NEXT:  .LBB61_707: # %cond.load693
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 175
 ; CHECK-RV32-NEXT:    li a4, 174
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 16
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_708
 ; CHECK-RV32-NEXT:    j .LBB61_187
 ; CHECK-RV32-NEXT:  .LBB61_708: # %cond.load697
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 176
 ; CHECK-RV32-NEXT:    li a4, 175
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 15
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_709
 ; CHECK-RV32-NEXT:    j .LBB61_188
 ; CHECK-RV32-NEXT:  .LBB61_709: # %cond.load701
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 177
 ; CHECK-RV32-NEXT:    li a4, 176
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 14
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_710
 ; CHECK-RV32-NEXT:    j .LBB61_189
 ; CHECK-RV32-NEXT:  .LBB61_710: # %cond.load705
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 178
 ; CHECK-RV32-NEXT:    li a4, 177
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 13
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_711
 ; CHECK-RV32-NEXT:    j .LBB61_190
 ; CHECK-RV32-NEXT:  .LBB61_711: # %cond.load709
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 179
 ; CHECK-RV32-NEXT:    li a4, 178
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 12
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_712
 ; CHECK-RV32-NEXT:    j .LBB61_191
 ; CHECK-RV32-NEXT:  .LBB61_712: # %cond.load713
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 180
 ; CHECK-RV32-NEXT:    li a4, 179
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 11
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_713
 ; CHECK-RV32-NEXT:    j .LBB61_192
 ; CHECK-RV32-NEXT:  .LBB61_713: # %cond.load717
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 181
 ; CHECK-RV32-NEXT:    li a4, 180
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 10
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_714
 ; CHECK-RV32-NEXT:    j .LBB61_193
 ; CHECK-RV32-NEXT:  .LBB61_714: # %cond.load721
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 182
 ; CHECK-RV32-NEXT:    li a4, 181
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 9
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_715
 ; CHECK-RV32-NEXT:    j .LBB61_194
 ; CHECK-RV32-NEXT:  .LBB61_715: # %cond.load725
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 183
 ; CHECK-RV32-NEXT:    li a4, 182
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 8
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_716
 ; CHECK-RV32-NEXT:    j .LBB61_195
 ; CHECK-RV32-NEXT:  .LBB61_716: # %cond.load729
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 184
 ; CHECK-RV32-NEXT:    li a4, 183
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 7
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_717
 ; CHECK-RV32-NEXT:    j .LBB61_196
 ; CHECK-RV32-NEXT:  .LBB61_717: # %cond.load733
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 185
 ; CHECK-RV32-NEXT:    li a4, 184
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 6
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_718
 ; CHECK-RV32-NEXT:    j .LBB61_197
 ; CHECK-RV32-NEXT:  .LBB61_718: # %cond.load737
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 186
 ; CHECK-RV32-NEXT:    li a4, 185
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 5
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_719
 ; CHECK-RV32-NEXT:    j .LBB61_198
 ; CHECK-RV32-NEXT:  .LBB61_719: # %cond.load741
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 187
 ; CHECK-RV32-NEXT:    li a4, 186
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 4
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_720
 ; CHECK-RV32-NEXT:    j .LBB61_199
 ; CHECK-RV32-NEXT:  .LBB61_720: # %cond.load745
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 188
 ; CHECK-RV32-NEXT:    li a4, 187
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 3
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_721
 ; CHECK-RV32-NEXT:    j .LBB61_200
 ; CHECK-RV32-NEXT:  .LBB61_721: # %cond.load749
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 189
 ; CHECK-RV32-NEXT:    li a4, 188
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 2
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_1030
 ; CHECK-RV32-NEXT:    j .LBB61_201
@@ -6772,12 +6625,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_202
 ; CHECK-RV32-NEXT:  .LBB61_722: # %cond.load761
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 192
 ; CHECK-RV32-NEXT:    li a4, 191
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6787,13 +6640,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_723: # %cond.load765
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 193
 ; CHECK-RV32-NEXT:    li a4, 192
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6803,13 +6656,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_724: # %cond.load769
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 194
 ; CHECK-RV32-NEXT:    li a4, 193
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6819,13 +6672,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_725: # %cond.load773
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 195
 ; CHECK-RV32-NEXT:    li a4, 194
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6835,13 +6688,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_726: # %cond.load777
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 196
 ; CHECK-RV32-NEXT:    li a4, 195
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6851,13 +6704,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_727: # %cond.load781
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 197
 ; CHECK-RV32-NEXT:    li a4, 196
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6867,13 +6720,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_728: # %cond.load785
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 198
 ; CHECK-RV32-NEXT:    li a4, 197
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6883,13 +6736,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_729: # %cond.load789
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 199
 ; CHECK-RV32-NEXT:    li a4, 198
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6899,13 +6752,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_730: # %cond.load793
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 200
 ; CHECK-RV32-NEXT:    li a4, 199
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6915,13 +6768,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_731: # %cond.load797
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 201
 ; CHECK-RV32-NEXT:    li a4, 200
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6931,13 +6784,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_732: # %cond.load801
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 202
 ; CHECK-RV32-NEXT:    li a4, 201
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6947,13 +6800,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_733: # %cond.load805
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 203
 ; CHECK-RV32-NEXT:    li a4, 202
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6963,13 +6816,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_734: # %cond.load809
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 204
 ; CHECK-RV32-NEXT:    li a4, 203
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6979,13 +6832,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_735: # %cond.load813
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 205
 ; CHECK-RV32-NEXT:    li a4, 204
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -6995,13 +6848,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_736: # %cond.load817
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 206
 ; CHECK-RV32-NEXT:    li a4, 205
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7011,13 +6864,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_737: # %cond.load821
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 207
 ; CHECK-RV32-NEXT:    li a4, 206
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7027,13 +6880,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_738: # %cond.load825
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 208
 ; CHECK-RV32-NEXT:    li a4, 207
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7043,13 +6896,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_739: # %cond.load829
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 209
 ; CHECK-RV32-NEXT:    li a4, 208
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7059,13 +6912,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_740: # %cond.load833
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 210
 ; CHECK-RV32-NEXT:    li a4, 209
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7075,13 +6928,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_741: # %cond.load837
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 211
 ; CHECK-RV32-NEXT:    li a4, 210
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7091,13 +6944,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_742: # %cond.load841
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 212
 ; CHECK-RV32-NEXT:    li a4, 211
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7107,13 +6960,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_743: # %cond.load845
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 213
 ; CHECK-RV32-NEXT:    li a4, 212
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7123,13 +6976,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_744: # %cond.load849
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 214
 ; CHECK-RV32-NEXT:    li a4, 213
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7139,13 +6992,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_745: # %cond.load853
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 215
 ; CHECK-RV32-NEXT:    li a4, 214
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7155,13 +7008,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_746: # %cond.load857
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 216
 ; CHECK-RV32-NEXT:    li a4, 215
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7171,13 +7024,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_747: # %cond.load861
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 217
 ; CHECK-RV32-NEXT:    li a4, 216
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7187,13 +7040,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_748: # %cond.load865
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 218
 ; CHECK-RV32-NEXT:    li a4, 217
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7203,13 +7056,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_749: # %cond.load869
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 219
 ; CHECK-RV32-NEXT:    li a4, 218
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7219,13 +7072,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_750: # %cond.load873
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 220
 ; CHECK-RV32-NEXT:    li a4, 219
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7235,13 +7088,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:  .LBB61_751: # %cond.load877
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
-; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 221
 ; CHECK-RV32-NEXT:    li a4, 220
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -7252,479 +7105,479 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_236
 ; CHECK-RV32-NEXT:  .LBB61_752: # %cond.load889
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 224
 ; CHECK-RV32-NEXT:    li a4, 223
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_753
 ; CHECK-RV32-NEXT:    j .LBB61_240
 ; CHECK-RV32-NEXT:  .LBB61_753: # %cond.load893
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 225
 ; CHECK-RV32-NEXT:    li a4, 224
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 2
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_754
 ; CHECK-RV32-NEXT:    j .LBB61_241
 ; CHECK-RV32-NEXT:  .LBB61_754: # %cond.load897
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 226
 ; CHECK-RV32-NEXT:    li a4, 225
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 4
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_755
 ; CHECK-RV32-NEXT:    j .LBB61_242
 ; CHECK-RV32-NEXT:  .LBB61_755: # %cond.load901
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 227
 ; CHECK-RV32-NEXT:    li a4, 226
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 8
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_756
 ; CHECK-RV32-NEXT:    j .LBB61_243
 ; CHECK-RV32-NEXT:  .LBB61_756: # %cond.load905
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 228
 ; CHECK-RV32-NEXT:    li a4, 227
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 16
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_757
 ; CHECK-RV32-NEXT:    j .LBB61_244
 ; CHECK-RV32-NEXT:  .LBB61_757: # %cond.load909
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 229
 ; CHECK-RV32-NEXT:    li a4, 228
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 32
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_758
 ; CHECK-RV32-NEXT:    j .LBB61_245
 ; CHECK-RV32-NEXT:  .LBB61_758: # %cond.load913
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 230
 ; CHECK-RV32-NEXT:    li a4, 229
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 64
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_759
 ; CHECK-RV32-NEXT:    j .LBB61_246
 ; CHECK-RV32-NEXT:  .LBB61_759: # %cond.load917
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 231
 ; CHECK-RV32-NEXT:    li a4, 230
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 128
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_760
 ; CHECK-RV32-NEXT:    j .LBB61_247
 ; CHECK-RV32-NEXT:  .LBB61_760: # %cond.load921
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 232
 ; CHECK-RV32-NEXT:    li a4, 231
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 256
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_761
 ; CHECK-RV32-NEXT:    j .LBB61_248
 ; CHECK-RV32-NEXT:  .LBB61_761: # %cond.load925
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 233
 ; CHECK-RV32-NEXT:    li a4, 232
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 512
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_762
 ; CHECK-RV32-NEXT:    j .LBB61_249
 ; CHECK-RV32-NEXT:  .LBB61_762: # %cond.load929
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 234
 ; CHECK-RV32-NEXT:    li a4, 233
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    andi a3, a2, 1024
 ; CHECK-RV32-NEXT:    bnez a3, .LBB61_763
 ; CHECK-RV32-NEXT:    j .LBB61_250
 ; CHECK-RV32-NEXT:  .LBB61_763: # %cond.load933
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 235
 ; CHECK-RV32-NEXT:    li a4, 234
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 20
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_764
 ; CHECK-RV32-NEXT:    j .LBB61_251
 ; CHECK-RV32-NEXT:  .LBB61_764: # %cond.load937
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 236
 ; CHECK-RV32-NEXT:    li a4, 235
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 19
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_765
 ; CHECK-RV32-NEXT:    j .LBB61_252
 ; CHECK-RV32-NEXT:  .LBB61_765: # %cond.load941
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 237
 ; CHECK-RV32-NEXT:    li a4, 236
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 18
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_766
 ; CHECK-RV32-NEXT:    j .LBB61_253
 ; CHECK-RV32-NEXT:  .LBB61_766: # %cond.load945
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 238
 ; CHECK-RV32-NEXT:    li a4, 237
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 17
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_767
 ; CHECK-RV32-NEXT:    j .LBB61_254
 ; CHECK-RV32-NEXT:  .LBB61_767: # %cond.load949
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 239
 ; CHECK-RV32-NEXT:    li a4, 238
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 16
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_768
 ; CHECK-RV32-NEXT:    j .LBB61_255
 ; CHECK-RV32-NEXT:  .LBB61_768: # %cond.load953
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 240
 ; CHECK-RV32-NEXT:    li a4, 239
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 15
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_769
 ; CHECK-RV32-NEXT:    j .LBB61_256
 ; CHECK-RV32-NEXT:  .LBB61_769: # %cond.load957
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 241
 ; CHECK-RV32-NEXT:    li a4, 240
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 14
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_770
 ; CHECK-RV32-NEXT:    j .LBB61_257
 ; CHECK-RV32-NEXT:  .LBB61_770: # %cond.load961
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 242
 ; CHECK-RV32-NEXT:    li a4, 241
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 13
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_771
 ; CHECK-RV32-NEXT:    j .LBB61_258
 ; CHECK-RV32-NEXT:  .LBB61_771: # %cond.load965
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 243
 ; CHECK-RV32-NEXT:    li a4, 242
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 12
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_772
 ; CHECK-RV32-NEXT:    j .LBB61_259
 ; CHECK-RV32-NEXT:  .LBB61_772: # %cond.load969
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 244
 ; CHECK-RV32-NEXT:    li a4, 243
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 11
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_773
 ; CHECK-RV32-NEXT:    j .LBB61_260
 ; CHECK-RV32-NEXT:  .LBB61_773: # %cond.load973
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 245
 ; CHECK-RV32-NEXT:    li a4, 244
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 10
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_774
 ; CHECK-RV32-NEXT:    j .LBB61_261
 ; CHECK-RV32-NEXT:  .LBB61_774: # %cond.load977
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 246
 ; CHECK-RV32-NEXT:    li a4, 245
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 9
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_775
 ; CHECK-RV32-NEXT:    j .LBB61_262
 ; CHECK-RV32-NEXT:  .LBB61_775: # %cond.load981
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 247
 ; CHECK-RV32-NEXT:    li a4, 246
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 8
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_776
 ; CHECK-RV32-NEXT:    j .LBB61_263
 ; CHECK-RV32-NEXT:  .LBB61_776: # %cond.load985
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 248
 ; CHECK-RV32-NEXT:    li a4, 247
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 7
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_777
 ; CHECK-RV32-NEXT:    j .LBB61_264
 ; CHECK-RV32-NEXT:  .LBB61_777: # %cond.load989
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 249
 ; CHECK-RV32-NEXT:    li a4, 248
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 6
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_778
 ; CHECK-RV32-NEXT:    j .LBB61_265
 ; CHECK-RV32-NEXT:  .LBB61_778: # %cond.load993
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 250
 ; CHECK-RV32-NEXT:    li a4, 249
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 5
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_779
 ; CHECK-RV32-NEXT:    j .LBB61_266
 ; CHECK-RV32-NEXT:  .LBB61_779: # %cond.load997
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 251
 ; CHECK-RV32-NEXT:    li a4, 250
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 4
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_780
 ; CHECK-RV32-NEXT:    j .LBB61_267
 ; CHECK-RV32-NEXT:  .LBB61_780: # %cond.load1001
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 252
 ; CHECK-RV32-NEXT:    li a4, 251
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 3
 ; CHECK-RV32-NEXT:    bltz a3, .LBB61_781
 ; CHECK-RV32-NEXT:    j .LBB61_268
 ; CHECK-RV32-NEXT:  .LBB61_781: # %cond.load1005
 ; CHECK-RV32-NEXT:    lbu a3, 0(a0)
 ; CHECK-RV32-NEXT:    li a4, 512
+; CHECK-RV32-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.s.x v16, a3
-; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a3
 ; CHECK-RV32-NEXT:    li a3, 253
 ; CHECK-RV32-NEXT:    li a4, 252
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v16, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
-; CHECK-RV32-NEXT:    vmv4r.v v24, v8
-; CHECK-RV32-NEXT:    vmv8r.v v8, v24
+; CHECK-RV32-NEXT:    vmv4r.v v16, v8
+; CHECK-RV32-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV32-NEXT:    slli a3, a2, 2
 ; CHECK-RV32-NEXT:    bgez a3, .LBB61_1032
 ; CHECK-RV32-NEXT:    j .LBB61_269
@@ -7732,12 +7585,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV32-NEXT:    j .LBB61_270
 ; CHECK-RV32-NEXT:  .LBB61_782: # %cond.load1017
 ; CHECK-RV32-NEXT:    lbu a2, 0(a0)
-; CHECK-RV32-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV32-NEXT:    vmv8r.v v24, v8
+; CHECK-RV32-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV32-NEXT:    li a2, 256
 ; CHECK-RV32-NEXT:    li a4, 255
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV32-NEXT:    vslideup.vx v8, v20, a4
+; CHECK-RV32-NEXT:    vslideup.vx v8, v12, a4
 ; CHECK-RV32-NEXT:    addi a0, a0, 1
 ; CHECK-RV32-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV32-NEXT:    vmv8r.v v8, v24
@@ -11138,13 +10991,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_62: # %cond.load241
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 62
 ; CHECK-RV64-NEXT:    li a3, 61
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -11155,12 +11008,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_65
 ; CHECK-RV64-NEXT:  # %bb.64: # %cond.load245
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v17, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 63
 ; CHECK-RV64-NEXT:    li a3, 62
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v17, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v24, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
@@ -11419,13 +11272,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_128: # %cond.load497
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 126
 ; CHECK-RV64-NEXT:    li a3, 125
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -11436,12 +11289,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_131
 ; CHECK-RV64-NEXT:  # %bb.130: # %cond.load501
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v18, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 127
 ; CHECK-RV64-NEXT:    li a3, 126
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v18, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v24, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
@@ -11700,16 +11553,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_194: # %cond.load753
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 190
 ; CHECK-RV64-NEXT:    li a3, 189
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_195: # %else754
 ; CHECK-RV64-NEXT:    slli a1, a2, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -11717,12 +11570,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_197
 ; CHECK-RV64-NEXT:  # %bb.196: # %cond.load757
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v20, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 191
 ; CHECK-RV64-NEXT:    li a3, 190
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v20, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
@@ -11981,16 +11834,16 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_260: # %cond.load1009
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 254
 ; CHECK-RV64-NEXT:    li a3, 253
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:  .LBB61_261: # %else1010
 ; CHECK-RV64-NEXT:    slli a2, a1, 1
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -11998,12 +11851,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_263
 ; CHECK-RV64-NEXT:  # %bb.262: # %cond.load1013
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v20, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 255
 ; CHECK-RV64-NEXT:    li a3, 254
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v20, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv4r.v v24, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v24
@@ -13107,374 +12960,374 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_2
 ; CHECK-RV64-NEXT:  .LBB61_528: # %cond.load1
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    vsetivli zero, 2, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 4
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_529
 ; CHECK-RV64-NEXT:    j .LBB61_3
 ; CHECK-RV64-NEXT:  .LBB61_529: # %cond.load5
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 3, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 2
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 8
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_530
 ; CHECK-RV64-NEXT:    j .LBB61_4
 ; CHECK-RV64-NEXT:  .LBB61_530: # %cond.load9
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 4, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 3
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 16
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_531
 ; CHECK-RV64-NEXT:    j .LBB61_5
 ; CHECK-RV64-NEXT:  .LBB61_531: # %cond.load13
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 5, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 4
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 4
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 32
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_532
 ; CHECK-RV64-NEXT:    j .LBB61_6
 ; CHECK-RV64-NEXT:  .LBB61_532: # %cond.load17
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 6, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 5
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 5
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 64
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_533
 ; CHECK-RV64-NEXT:    j .LBB61_7
 ; CHECK-RV64-NEXT:  .LBB61_533: # %cond.load21
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 7, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 6
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 6
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 128
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_534
 ; CHECK-RV64-NEXT:    j .LBB61_8
 ; CHECK-RV64-NEXT:  .LBB61_534: # %cond.load25
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 8, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 7
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 7
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 256
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_535
 ; CHECK-RV64-NEXT:    j .LBB61_9
 ; CHECK-RV64-NEXT:  .LBB61_535: # %cond.load29
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 9, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 8
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 8
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 512
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_536
 ; CHECK-RV64-NEXT:    j .LBB61_10
 ; CHECK-RV64-NEXT:  .LBB61_536: # %cond.load33
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 10, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 9
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 9
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1024
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_537
 ; CHECK-RV64-NEXT:    j .LBB61_11
 ; CHECK-RV64-NEXT:  .LBB61_537: # %cond.load37
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 11, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 10
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 10
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 52
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_538
 ; CHECK-RV64-NEXT:    j .LBB61_12
 ; CHECK-RV64-NEXT:  .LBB61_538: # %cond.load41
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 12, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 11
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 11
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 51
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_539
 ; CHECK-RV64-NEXT:    j .LBB61_13
 ; CHECK-RV64-NEXT:  .LBB61_539: # %cond.load45
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 13, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 12
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 12
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 50
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_540
 ; CHECK-RV64-NEXT:    j .LBB61_14
 ; CHECK-RV64-NEXT:  .LBB61_540: # %cond.load49
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 14, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 13
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 13
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 49
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_541
 ; CHECK-RV64-NEXT:    j .LBB61_15
 ; CHECK-RV64-NEXT:  .LBB61_541: # %cond.load53
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 15, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 14
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 14
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 48
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_542
 ; CHECK-RV64-NEXT:    j .LBB61_16
 ; CHECK-RV64-NEXT:  .LBB61_542: # %cond.load57
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 15
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 15
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 47
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_543
 ; CHECK-RV64-NEXT:    j .LBB61_17
 ; CHECK-RV64-NEXT:  .LBB61_543: # %cond.load61
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 17, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 16
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 16
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 46
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_544
 ; CHECK-RV64-NEXT:    j .LBB61_18
 ; CHECK-RV64-NEXT:  .LBB61_544: # %cond.load65
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 18, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 17
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 17
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 45
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_545
 ; CHECK-RV64-NEXT:    j .LBB61_19
 ; CHECK-RV64-NEXT:  .LBB61_545: # %cond.load69
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 19, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 18
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 18
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 44
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_546
 ; CHECK-RV64-NEXT:    j .LBB61_20
 ; CHECK-RV64-NEXT:  .LBB61_546: # %cond.load73
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 20, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 19
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 19
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 43
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_547
 ; CHECK-RV64-NEXT:    j .LBB61_21
 ; CHECK-RV64-NEXT:  .LBB61_547: # %cond.load77
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 21, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 20
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 20
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 42
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_548
 ; CHECK-RV64-NEXT:    j .LBB61_22
 ; CHECK-RV64-NEXT:  .LBB61_548: # %cond.load81
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 22, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 21
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 21
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 41
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_549
 ; CHECK-RV64-NEXT:    j .LBB61_23
 ; CHECK-RV64-NEXT:  .LBB61_549: # %cond.load85
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 23, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 22
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 22
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 40
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_550
 ; CHECK-RV64-NEXT:    j .LBB61_24
 ; CHECK-RV64-NEXT:  .LBB61_550: # %cond.load89
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 24, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 23
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 23
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 39
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_551
 ; CHECK-RV64-NEXT:    j .LBB61_25
 ; CHECK-RV64-NEXT:  .LBB61_551: # %cond.load93
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 25, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 24
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 24
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 38
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_552
 ; CHECK-RV64-NEXT:    j .LBB61_26
 ; CHECK-RV64-NEXT:  .LBB61_552: # %cond.load97
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 26, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 25
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 25
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 37
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_553
 ; CHECK-RV64-NEXT:    j .LBB61_27
 ; CHECK-RV64-NEXT:  .LBB61_553: # %cond.load101
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 27, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 26
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 26
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 36
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_554
 ; CHECK-RV64-NEXT:    j .LBB61_28
 ; CHECK-RV64-NEXT:  .LBB61_554: # %cond.load105
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 28, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 27
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 27
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 35
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_555
 ; CHECK-RV64-NEXT:    j .LBB61_29
 ; CHECK-RV64-NEXT:  .LBB61_555: # %cond.load109
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 29, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 28
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 28
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 34
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_556
 ; CHECK-RV64-NEXT:    j .LBB61_30
 ; CHECK-RV64-NEXT:  .LBB61_556: # %cond.load113
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 30, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 29
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 29
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 33
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_557
 ; CHECK-RV64-NEXT:    j .LBB61_31
 ; CHECK-RV64-NEXT:  .LBB61_557: # %cond.load117
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetivli zero, 31, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
-; CHECK-RV64-NEXT:    vslideup.vi v8, v16, 30
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 30
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv1r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv1r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 32
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_558
 ; CHECK-RV64-NEXT:    j .LBB61_32
 ; CHECK-RV64-NEXT:  .LBB61_558: # %cond.load121
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 32
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vi v8, v24, 31
+; CHECK-RV64-NEXT:    vslideup.vi v8, v9, 31
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13484,13 +13337,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_559: # %cond.load125
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 33
 ; CHECK-RV64-NEXT:    li a3, 32
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13500,13 +13353,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_560: # %cond.load129
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 34
 ; CHECK-RV64-NEXT:    li a3, 33
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13516,13 +13369,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_561: # %cond.load133
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 35
 ; CHECK-RV64-NEXT:    li a3, 34
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13532,13 +13385,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_562: # %cond.load137
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 36
 ; CHECK-RV64-NEXT:    li a3, 35
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13548,13 +13401,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_563: # %cond.load141
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 37
 ; CHECK-RV64-NEXT:    li a3, 36
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13564,13 +13417,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_564: # %cond.load145
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 38
 ; CHECK-RV64-NEXT:    li a3, 37
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13580,13 +13433,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_565: # %cond.load149
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 39
 ; CHECK-RV64-NEXT:    li a3, 38
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13596,13 +13449,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_566: # %cond.load153
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 40
 ; CHECK-RV64-NEXT:    li a3, 39
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13612,13 +13465,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_567: # %cond.load157
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 41
 ; CHECK-RV64-NEXT:    li a3, 40
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13628,13 +13481,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_568: # %cond.load161
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 42
 ; CHECK-RV64-NEXT:    li a3, 41
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13644,13 +13497,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_569: # %cond.load165
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 43
 ; CHECK-RV64-NEXT:    li a3, 42
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13660,13 +13513,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_570: # %cond.load169
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 44
 ; CHECK-RV64-NEXT:    li a3, 43
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13676,13 +13529,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_571: # %cond.load173
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 45
 ; CHECK-RV64-NEXT:    li a3, 44
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13692,13 +13545,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_572: # %cond.load177
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 46
 ; CHECK-RV64-NEXT:    li a3, 45
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13708,13 +13561,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_573: # %cond.load181
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 47
 ; CHECK-RV64-NEXT:    li a3, 46
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13724,13 +13577,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_574: # %cond.load185
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 48
 ; CHECK-RV64-NEXT:    li a3, 47
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13740,13 +13593,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_575: # %cond.load189
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 49
 ; CHECK-RV64-NEXT:    li a3, 48
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13756,13 +13609,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_576: # %cond.load193
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 50
 ; CHECK-RV64-NEXT:    li a3, 49
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13772,13 +13625,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_577: # %cond.load197
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 51
 ; CHECK-RV64-NEXT:    li a3, 50
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13788,13 +13641,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_578: # %cond.load201
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 52
 ; CHECK-RV64-NEXT:    li a3, 51
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13804,13 +13657,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_579: # %cond.load205
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 53
 ; CHECK-RV64-NEXT:    li a3, 52
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13820,13 +13673,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_580: # %cond.load209
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 54
 ; CHECK-RV64-NEXT:    li a3, 53
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13836,13 +13689,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_581: # %cond.load213
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 55
 ; CHECK-RV64-NEXT:    li a3, 54
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13852,13 +13705,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_582: # %cond.load217
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 56
 ; CHECK-RV64-NEXT:    li a3, 55
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13868,13 +13721,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_583: # %cond.load221
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 57
 ; CHECK-RV64-NEXT:    li a3, 56
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13884,13 +13737,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_584: # %cond.load225
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 58
 ; CHECK-RV64-NEXT:    li a3, 57
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13900,13 +13753,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_585: # %cond.load229
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 59
 ; CHECK-RV64-NEXT:    li a3, 58
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13916,13 +13769,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_586: # %cond.load233
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 60
 ; CHECK-RV64-NEXT:    li a3, 59
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13932,13 +13785,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_587: # %cond.load237
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v9, a1
 ; CHECK-RV64-NEXT:    li a1, 61
 ; CHECK-RV64-NEXT:    li a3, 60
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13949,12 +13802,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_63
 ; CHECK-RV64-NEXT:  .LBB61_588: # %cond.load249
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vmv.s.x v9, a2
 ; CHECK-RV64-NEXT:    li a2, 64
 ; CHECK-RV64-NEXT:    li a3, 63
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m1, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v9, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv1r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13964,13 +13817,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_589: # %cond.load253
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 65
 ; CHECK-RV64-NEXT:    li a3, 64
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13980,13 +13833,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_590: # %cond.load257
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 66
 ; CHECK-RV64-NEXT:    li a3, 65
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -13996,13 +13849,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_591: # %cond.load261
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 67
 ; CHECK-RV64-NEXT:    li a3, 66
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14012,13 +13865,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_592: # %cond.load265
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 68
 ; CHECK-RV64-NEXT:    li a3, 67
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14028,13 +13881,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_593: # %cond.load269
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 69
 ; CHECK-RV64-NEXT:    li a3, 68
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14044,13 +13897,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_594: # %cond.load273
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 70
 ; CHECK-RV64-NEXT:    li a3, 69
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14060,13 +13913,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_595: # %cond.load277
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 71
 ; CHECK-RV64-NEXT:    li a3, 70
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14076,13 +13929,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_596: # %cond.load281
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 72
 ; CHECK-RV64-NEXT:    li a3, 71
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14092,13 +13945,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_597: # %cond.load285
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 73
 ; CHECK-RV64-NEXT:    li a3, 72
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14108,13 +13961,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_598: # %cond.load289
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 74
 ; CHECK-RV64-NEXT:    li a3, 73
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14124,13 +13977,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_599: # %cond.load293
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 75
 ; CHECK-RV64-NEXT:    li a3, 74
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14140,13 +13993,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_600: # %cond.load297
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 76
 ; CHECK-RV64-NEXT:    li a3, 75
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14156,13 +14009,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_601: # %cond.load301
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 77
 ; CHECK-RV64-NEXT:    li a3, 76
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14172,13 +14025,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_602: # %cond.load305
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 78
 ; CHECK-RV64-NEXT:    li a3, 77
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14188,13 +14041,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_603: # %cond.load309
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 79
 ; CHECK-RV64-NEXT:    li a3, 78
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14204,13 +14057,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_604: # %cond.load313
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 80
 ; CHECK-RV64-NEXT:    li a3, 79
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14220,13 +14073,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_605: # %cond.load317
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 81
 ; CHECK-RV64-NEXT:    li a3, 80
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14236,13 +14089,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_606: # %cond.load321
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 82
 ; CHECK-RV64-NEXT:    li a3, 81
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14252,13 +14105,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_607: # %cond.load325
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 83
 ; CHECK-RV64-NEXT:    li a3, 82
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14268,13 +14121,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_608: # %cond.load329
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 84
 ; CHECK-RV64-NEXT:    li a3, 83
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14284,13 +14137,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_609: # %cond.load333
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 85
 ; CHECK-RV64-NEXT:    li a3, 84
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14300,13 +14153,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_610: # %cond.load337
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 86
 ; CHECK-RV64-NEXT:    li a3, 85
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14316,13 +14169,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_611: # %cond.load341
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 87
 ; CHECK-RV64-NEXT:    li a3, 86
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14332,13 +14185,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_612: # %cond.load345
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 88
 ; CHECK-RV64-NEXT:    li a3, 87
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14348,13 +14201,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_613: # %cond.load349
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 89
 ; CHECK-RV64-NEXT:    li a3, 88
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14364,13 +14217,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_614: # %cond.load353
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 90
 ; CHECK-RV64-NEXT:    li a3, 89
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14380,13 +14233,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_615: # %cond.load357
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 91
 ; CHECK-RV64-NEXT:    li a3, 90
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14396,13 +14249,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_616: # %cond.load361
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 92
 ; CHECK-RV64-NEXT:    li a3, 91
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14412,13 +14265,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_617: # %cond.load365
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 93
 ; CHECK-RV64-NEXT:    li a3, 92
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14428,13 +14281,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_618: # %cond.load369
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 94
 ; CHECK-RV64-NEXT:    li a3, 93
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14444,13 +14297,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_619: # %cond.load373
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 95
 ; CHECK-RV64-NEXT:    li a3, 94
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14460,13 +14313,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_620: # %cond.load377
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 96
 ; CHECK-RV64-NEXT:    li a3, 95
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14476,13 +14329,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_621: # %cond.load381
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 97
 ; CHECK-RV64-NEXT:    li a3, 96
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14492,13 +14345,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_622: # %cond.load385
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 98
 ; CHECK-RV64-NEXT:    li a3, 97
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14508,13 +14361,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_623: # %cond.load389
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 99
 ; CHECK-RV64-NEXT:    li a3, 98
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14524,13 +14377,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_624: # %cond.load393
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 100
 ; CHECK-RV64-NEXT:    li a3, 99
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14540,13 +14393,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_625: # %cond.load397
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 101
 ; CHECK-RV64-NEXT:    li a3, 100
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14556,13 +14409,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_626: # %cond.load401
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 102
 ; CHECK-RV64-NEXT:    li a3, 101
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14572,13 +14425,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_627: # %cond.load405
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 103
 ; CHECK-RV64-NEXT:    li a3, 102
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14588,13 +14441,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_628: # %cond.load409
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 104
 ; CHECK-RV64-NEXT:    li a3, 103
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14604,13 +14457,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_629: # %cond.load413
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 105
 ; CHECK-RV64-NEXT:    li a3, 104
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14620,13 +14473,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_630: # %cond.load417
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 106
 ; CHECK-RV64-NEXT:    li a3, 105
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14636,13 +14489,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_631: # %cond.load421
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 107
 ; CHECK-RV64-NEXT:    li a3, 106
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14652,13 +14505,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_632: # %cond.load425
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 108
 ; CHECK-RV64-NEXT:    li a3, 107
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14668,13 +14521,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_633: # %cond.load429
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 109
 ; CHECK-RV64-NEXT:    li a3, 108
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14684,13 +14537,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_634: # %cond.load433
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 110
 ; CHECK-RV64-NEXT:    li a3, 109
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14700,13 +14553,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_635: # %cond.load437
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 111
 ; CHECK-RV64-NEXT:    li a3, 110
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14716,13 +14569,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_636: # %cond.load441
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 112
 ; CHECK-RV64-NEXT:    li a3, 111
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14732,13 +14585,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_637: # %cond.load445
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 113
 ; CHECK-RV64-NEXT:    li a3, 112
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14748,13 +14601,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_638: # %cond.load449
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 114
 ; CHECK-RV64-NEXT:    li a3, 113
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14764,13 +14617,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_639: # %cond.load453
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 115
 ; CHECK-RV64-NEXT:    li a3, 114
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14780,13 +14633,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_640: # %cond.load457
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 116
 ; CHECK-RV64-NEXT:    li a3, 115
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14796,13 +14649,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_641: # %cond.load461
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 117
 ; CHECK-RV64-NEXT:    li a3, 116
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14812,13 +14665,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_642: # %cond.load465
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 118
 ; CHECK-RV64-NEXT:    li a3, 117
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14828,13 +14681,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_643: # %cond.load469
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 119
 ; CHECK-RV64-NEXT:    li a3, 118
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14844,13 +14697,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_644: # %cond.load473
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 120
 ; CHECK-RV64-NEXT:    li a3, 119
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14860,13 +14713,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_645: # %cond.load477
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 121
 ; CHECK-RV64-NEXT:    li a3, 120
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14876,13 +14729,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_646: # %cond.load481
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 122
 ; CHECK-RV64-NEXT:    li a3, 121
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14892,13 +14745,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_647: # %cond.load485
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 123
 ; CHECK-RV64-NEXT:    li a3, 122
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14908,13 +14761,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_648: # %cond.load489
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 124
 ; CHECK-RV64-NEXT:    li a3, 123
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14924,13 +14777,13 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_649: # %cond.load493
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
-; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v24, a2
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    vmv.s.x v10, a2
 ; CHECK-RV64-NEXT:    li a2, 125
 ; CHECK-RV64-NEXT:    li a3, 124
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14941,12 +14794,12 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_129
 ; CHECK-RV64-NEXT:  .LBB61_650: # %cond.load505
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v24, a1
 ; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vmv.s.x v10, a1
 ; CHECK-RV64-NEXT:    li a1, 128
 ; CHECK-RV64-NEXT:    li a3, 127
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m2, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v10, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
 ; CHECK-RV64-NEXT:    vmv2r.v v16, v8
 ; CHECK-RV64-NEXT:    vmv8r.v v8, v16
@@ -14956,976 +14809,976 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:  .LBB61_651: # %cond.load509
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 129
 ; CHECK-RV64-NEXT:    li a3, 128
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 2
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_652
 ; CHECK-RV64-NEXT:    j .LBB61_134
 ; CHECK-RV64-NEXT:  .LBB61_652: # %cond.load513
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 130
 ; CHECK-RV64-NEXT:    li a3, 129
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 4
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_653
 ; CHECK-RV64-NEXT:    j .LBB61_135
 ; CHECK-RV64-NEXT:  .LBB61_653: # %cond.load517
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 131
 ; CHECK-RV64-NEXT:    li a3, 130
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 8
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_654
 ; CHECK-RV64-NEXT:    j .LBB61_136
 ; CHECK-RV64-NEXT:  .LBB61_654: # %cond.load521
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 132
 ; CHECK-RV64-NEXT:    li a3, 131
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 16
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_655
 ; CHECK-RV64-NEXT:    j .LBB61_137
 ; CHECK-RV64-NEXT:  .LBB61_655: # %cond.load525
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 133
 ; CHECK-RV64-NEXT:    li a3, 132
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 32
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_656
 ; CHECK-RV64-NEXT:    j .LBB61_138
 ; CHECK-RV64-NEXT:  .LBB61_656: # %cond.load529
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 134
 ; CHECK-RV64-NEXT:    li a3, 133
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 64
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_657
 ; CHECK-RV64-NEXT:    j .LBB61_139
 ; CHECK-RV64-NEXT:  .LBB61_657: # %cond.load533
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 135
 ; CHECK-RV64-NEXT:    li a3, 134
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 128
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_658
 ; CHECK-RV64-NEXT:    j .LBB61_140
 ; CHECK-RV64-NEXT:  .LBB61_658: # %cond.load537
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 136
 ; CHECK-RV64-NEXT:    li a3, 135
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 256
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_659
 ; CHECK-RV64-NEXT:    j .LBB61_141
 ; CHECK-RV64-NEXT:  .LBB61_659: # %cond.load541
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 137
 ; CHECK-RV64-NEXT:    li a3, 136
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 512
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_660
 ; CHECK-RV64-NEXT:    j .LBB61_142
 ; CHECK-RV64-NEXT:  .LBB61_660: # %cond.load545
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 138
 ; CHECK-RV64-NEXT:    li a3, 137
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1024
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_661
 ; CHECK-RV64-NEXT:    j .LBB61_143
 ; CHECK-RV64-NEXT:  .LBB61_661: # %cond.load549
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 139
 ; CHECK-RV64-NEXT:    li a3, 138
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 52
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_662
 ; CHECK-RV64-NEXT:    j .LBB61_144
 ; CHECK-RV64-NEXT:  .LBB61_662: # %cond.load553
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 140
 ; CHECK-RV64-NEXT:    li a3, 139
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 51
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_663
 ; CHECK-RV64-NEXT:    j .LBB61_145
 ; CHECK-RV64-NEXT:  .LBB61_663: # %cond.load557
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 141
 ; CHECK-RV64-NEXT:    li a3, 140
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 50
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_664
 ; CHECK-RV64-NEXT:    j .LBB61_146
 ; CHECK-RV64-NEXT:  .LBB61_664: # %cond.load561
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 142
 ; CHECK-RV64-NEXT:    li a3, 141
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 49
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_665
 ; CHECK-RV64-NEXT:    j .LBB61_147
 ; CHECK-RV64-NEXT:  .LBB61_665: # %cond.load565
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 143
 ; CHECK-RV64-NEXT:    li a3, 142
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 48
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_666
 ; CHECK-RV64-NEXT:    j .LBB61_148
 ; CHECK-RV64-NEXT:  .LBB61_666: # %cond.load569
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 144
 ; CHECK-RV64-NEXT:    li a3, 143
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 47
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_667
 ; CHECK-RV64-NEXT:    j .LBB61_149
 ; CHECK-RV64-NEXT:  .LBB61_667: # %cond.load573
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 145
 ; CHECK-RV64-NEXT:    li a3, 144
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 46
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_668
 ; CHECK-RV64-NEXT:    j .LBB61_150
 ; CHECK-RV64-NEXT:  .LBB61_668: # %cond.load577
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 146
 ; CHECK-RV64-NEXT:    li a3, 145
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 45
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_669
 ; CHECK-RV64-NEXT:    j .LBB61_151
 ; CHECK-RV64-NEXT:  .LBB61_669: # %cond.load581
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 147
 ; CHECK-RV64-NEXT:    li a3, 146
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 44
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_670
 ; CHECK-RV64-NEXT:    j .LBB61_152
 ; CHECK-RV64-NEXT:  .LBB61_670: # %cond.load585
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 148
 ; CHECK-RV64-NEXT:    li a3, 147
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 43
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_671
 ; CHECK-RV64-NEXT:    j .LBB61_153
 ; CHECK-RV64-NEXT:  .LBB61_671: # %cond.load589
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 149
 ; CHECK-RV64-NEXT:    li a3, 148
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 42
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_672
 ; CHECK-RV64-NEXT:    j .LBB61_154
 ; CHECK-RV64-NEXT:  .LBB61_672: # %cond.load593
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 150
 ; CHECK-RV64-NEXT:    li a3, 149
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 41
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_673
 ; CHECK-RV64-NEXT:    j .LBB61_155
 ; CHECK-RV64-NEXT:  .LBB61_673: # %cond.load597
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 151
 ; CHECK-RV64-NEXT:    li a3, 150
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 40
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_674
 ; CHECK-RV64-NEXT:    j .LBB61_156
 ; CHECK-RV64-NEXT:  .LBB61_674: # %cond.load601
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 152
 ; CHECK-RV64-NEXT:    li a3, 151
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 39
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_675
 ; CHECK-RV64-NEXT:    j .LBB61_157
 ; CHECK-RV64-NEXT:  .LBB61_675: # %cond.load605
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 153
 ; CHECK-RV64-NEXT:    li a3, 152
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 38
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_676
 ; CHECK-RV64-NEXT:    j .LBB61_158
 ; CHECK-RV64-NEXT:  .LBB61_676: # %cond.load609
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 154
 ; CHECK-RV64-NEXT:    li a3, 153
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 37
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_677
 ; CHECK-RV64-NEXT:    j .LBB61_159
 ; CHECK-RV64-NEXT:  .LBB61_677: # %cond.load613
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 155
 ; CHECK-RV64-NEXT:    li a3, 154
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 36
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_678
 ; CHECK-RV64-NEXT:    j .LBB61_160
 ; CHECK-RV64-NEXT:  .LBB61_678: # %cond.load617
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 156
 ; CHECK-RV64-NEXT:    li a3, 155
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 35
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_679
 ; CHECK-RV64-NEXT:    j .LBB61_161
 ; CHECK-RV64-NEXT:  .LBB61_679: # %cond.load621
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 157
 ; CHECK-RV64-NEXT:    li a3, 156
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 34
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_680
 ; CHECK-RV64-NEXT:    j .LBB61_162
 ; CHECK-RV64-NEXT:  .LBB61_680: # %cond.load625
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 158
 ; CHECK-RV64-NEXT:    li a3, 157
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 33
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_681
 ; CHECK-RV64-NEXT:    j .LBB61_163
 ; CHECK-RV64-NEXT:  .LBB61_681: # %cond.load629
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 159
 ; CHECK-RV64-NEXT:    li a3, 158
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 32
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_682
 ; CHECK-RV64-NEXT:    j .LBB61_164
 ; CHECK-RV64-NEXT:  .LBB61_682: # %cond.load633
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 160
 ; CHECK-RV64-NEXT:    li a3, 159
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 31
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_683
 ; CHECK-RV64-NEXT:    j .LBB61_165
 ; CHECK-RV64-NEXT:  .LBB61_683: # %cond.load637
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 161
 ; CHECK-RV64-NEXT:    li a3, 160
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 30
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_684
 ; CHECK-RV64-NEXT:    j .LBB61_166
 ; CHECK-RV64-NEXT:  .LBB61_684: # %cond.load641
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 162
 ; CHECK-RV64-NEXT:    li a3, 161
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 29
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_685
 ; CHECK-RV64-NEXT:    j .LBB61_167
 ; CHECK-RV64-NEXT:  .LBB61_685: # %cond.load645
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 163
 ; CHECK-RV64-NEXT:    li a3, 162
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 28
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_686
 ; CHECK-RV64-NEXT:    j .LBB61_168
 ; CHECK-RV64-NEXT:  .LBB61_686: # %cond.load649
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 164
 ; CHECK-RV64-NEXT:    li a3, 163
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 27
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_687
 ; CHECK-RV64-NEXT:    j .LBB61_169
 ; CHECK-RV64-NEXT:  .LBB61_687: # %cond.load653
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 165
 ; CHECK-RV64-NEXT:    li a3, 164
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 26
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_688
 ; CHECK-RV64-NEXT:    j .LBB61_170
 ; CHECK-RV64-NEXT:  .LBB61_688: # %cond.load657
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 166
 ; CHECK-RV64-NEXT:    li a3, 165
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 25
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_689
 ; CHECK-RV64-NEXT:    j .LBB61_171
 ; CHECK-RV64-NEXT:  .LBB61_689: # %cond.load661
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 167
 ; CHECK-RV64-NEXT:    li a3, 166
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 24
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_690
 ; CHECK-RV64-NEXT:    j .LBB61_172
 ; CHECK-RV64-NEXT:  .LBB61_690: # %cond.load665
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 168
 ; CHECK-RV64-NEXT:    li a3, 167
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 23
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_691
 ; CHECK-RV64-NEXT:    j .LBB61_173
 ; CHECK-RV64-NEXT:  .LBB61_691: # %cond.load669
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 169
 ; CHECK-RV64-NEXT:    li a3, 168
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 22
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_692
 ; CHECK-RV64-NEXT:    j .LBB61_174
 ; CHECK-RV64-NEXT:  .LBB61_692: # %cond.load673
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 170
 ; CHECK-RV64-NEXT:    li a3, 169
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 21
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_693
 ; CHECK-RV64-NEXT:    j .LBB61_175
 ; CHECK-RV64-NEXT:  .LBB61_693: # %cond.load677
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 171
 ; CHECK-RV64-NEXT:    li a3, 170
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 20
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_694
 ; CHECK-RV64-NEXT:    j .LBB61_176
 ; CHECK-RV64-NEXT:  .LBB61_694: # %cond.load681
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 172
 ; CHECK-RV64-NEXT:    li a3, 171
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 19
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_695
 ; CHECK-RV64-NEXT:    j .LBB61_177
 ; CHECK-RV64-NEXT:  .LBB61_695: # %cond.load685
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 173
 ; CHECK-RV64-NEXT:    li a3, 172
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 18
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_696
 ; CHECK-RV64-NEXT:    j .LBB61_178
 ; CHECK-RV64-NEXT:  .LBB61_696: # %cond.load689
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 174
 ; CHECK-RV64-NEXT:    li a3, 173
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 17
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_697
 ; CHECK-RV64-NEXT:    j .LBB61_179
 ; CHECK-RV64-NEXT:  .LBB61_697: # %cond.load693
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 175
 ; CHECK-RV64-NEXT:    li a3, 174
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 16
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_698
 ; CHECK-RV64-NEXT:    j .LBB61_180
 ; CHECK-RV64-NEXT:  .LBB61_698: # %cond.load697
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 176
 ; CHECK-RV64-NEXT:    li a3, 175
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 15
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_699
 ; CHECK-RV64-NEXT:    j .LBB61_181
 ; CHECK-RV64-NEXT:  .LBB61_699: # %cond.load701
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 177
 ; CHECK-RV64-NEXT:    li a3, 176
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 14
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_700
 ; CHECK-RV64-NEXT:    j .LBB61_182
 ; CHECK-RV64-NEXT:  .LBB61_700: # %cond.load705
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 178
 ; CHECK-RV64-NEXT:    li a3, 177
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 13
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_701
 ; CHECK-RV64-NEXT:    j .LBB61_183
 ; CHECK-RV64-NEXT:  .LBB61_701: # %cond.load709
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 179
 ; CHECK-RV64-NEXT:    li a3, 178
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 12
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_702
 ; CHECK-RV64-NEXT:    j .LBB61_184
 ; CHECK-RV64-NEXT:  .LBB61_702: # %cond.load713
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 180
 ; CHECK-RV64-NEXT:    li a3, 179
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 11
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_703
 ; CHECK-RV64-NEXT:    j .LBB61_185
 ; CHECK-RV64-NEXT:  .LBB61_703: # %cond.load717
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 181
 ; CHECK-RV64-NEXT:    li a3, 180
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 10
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_704
 ; CHECK-RV64-NEXT:    j .LBB61_186
 ; CHECK-RV64-NEXT:  .LBB61_704: # %cond.load721
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 182
 ; CHECK-RV64-NEXT:    li a3, 181
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 9
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_705
 ; CHECK-RV64-NEXT:    j .LBB61_187
 ; CHECK-RV64-NEXT:  .LBB61_705: # %cond.load725
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 183
 ; CHECK-RV64-NEXT:    li a3, 182
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 8
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_706
 ; CHECK-RV64-NEXT:    j .LBB61_188
 ; CHECK-RV64-NEXT:  .LBB61_706: # %cond.load729
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 184
 ; CHECK-RV64-NEXT:    li a3, 183
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 7
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_707
 ; CHECK-RV64-NEXT:    j .LBB61_189
 ; CHECK-RV64-NEXT:  .LBB61_707: # %cond.load733
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 185
 ; CHECK-RV64-NEXT:    li a3, 184
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 6
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_708
 ; CHECK-RV64-NEXT:    j .LBB61_190
 ; CHECK-RV64-NEXT:  .LBB61_708: # %cond.load737
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 186
 ; CHECK-RV64-NEXT:    li a3, 185
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 5
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_709
 ; CHECK-RV64-NEXT:    j .LBB61_191
 ; CHECK-RV64-NEXT:  .LBB61_709: # %cond.load741
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 187
 ; CHECK-RV64-NEXT:    li a3, 186
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 4
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_710
 ; CHECK-RV64-NEXT:    j .LBB61_192
 ; CHECK-RV64-NEXT:  .LBB61_710: # %cond.load745
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 188
 ; CHECK-RV64-NEXT:    li a3, 187
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 3
 ; CHECK-RV64-NEXT:    bltz a1, .LBB61_711
 ; CHECK-RV64-NEXT:    j .LBB61_193
 ; CHECK-RV64-NEXT:  .LBB61_711: # %cond.load749
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 189
 ; CHECK-RV64-NEXT:    li a3, 188
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a1, a2, 2
 ; CHECK-RV64-NEXT:    bgez a1, .LBB61_1027
 ; CHECK-RV64-NEXT:    j .LBB61_194
@@ -15933,991 +15786,991 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_195
 ; CHECK-RV64-NEXT:  .LBB61_712: # %cond.load761
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 192
 ; CHECK-RV64-NEXT:    li a3, 191
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 1
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_713
 ; CHECK-RV64-NEXT:    j .LBB61_199
 ; CHECK-RV64-NEXT:  .LBB61_713: # %cond.load765
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 193
 ; CHECK-RV64-NEXT:    li a3, 192
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 2
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_714
 ; CHECK-RV64-NEXT:    j .LBB61_200
 ; CHECK-RV64-NEXT:  .LBB61_714: # %cond.load769
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 194
 ; CHECK-RV64-NEXT:    li a3, 193
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 4
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_715
 ; CHECK-RV64-NEXT:    j .LBB61_201
 ; CHECK-RV64-NEXT:  .LBB61_715: # %cond.load773
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 195
 ; CHECK-RV64-NEXT:    li a3, 194
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 8
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_716
 ; CHECK-RV64-NEXT:    j .LBB61_202
 ; CHECK-RV64-NEXT:  .LBB61_716: # %cond.load777
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 196
 ; CHECK-RV64-NEXT:    li a3, 195
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 16
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_717
 ; CHECK-RV64-NEXT:    j .LBB61_203
 ; CHECK-RV64-NEXT:  .LBB61_717: # %cond.load781
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 197
 ; CHECK-RV64-NEXT:    li a3, 196
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 32
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_718
 ; CHECK-RV64-NEXT:    j .LBB61_204
 ; CHECK-RV64-NEXT:  .LBB61_718: # %cond.load785
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 198
 ; CHECK-RV64-NEXT:    li a3, 197
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 64
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_719
 ; CHECK-RV64-NEXT:    j .LBB61_205
 ; CHECK-RV64-NEXT:  .LBB61_719: # %cond.load789
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 199
 ; CHECK-RV64-NEXT:    li a3, 198
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 128
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_720
 ; CHECK-RV64-NEXT:    j .LBB61_206
 ; CHECK-RV64-NEXT:  .LBB61_720: # %cond.load793
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 200
 ; CHECK-RV64-NEXT:    li a3, 199
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 256
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_721
 ; CHECK-RV64-NEXT:    j .LBB61_207
 ; CHECK-RV64-NEXT:  .LBB61_721: # %cond.load797
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 201
 ; CHECK-RV64-NEXT:    li a3, 200
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 512
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_722
 ; CHECK-RV64-NEXT:    j .LBB61_208
 ; CHECK-RV64-NEXT:  .LBB61_722: # %cond.load801
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 202
 ; CHECK-RV64-NEXT:    li a3, 201
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a2, a1, 1024
 ; CHECK-RV64-NEXT:    bnez a2, .LBB61_723
 ; CHECK-RV64-NEXT:    j .LBB61_209
 ; CHECK-RV64-NEXT:  .LBB61_723: # %cond.load805
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 203
 ; CHECK-RV64-NEXT:    li a3, 202
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 52
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_724
 ; CHECK-RV64-NEXT:    j .LBB61_210
 ; CHECK-RV64-NEXT:  .LBB61_724: # %cond.load809
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 204
 ; CHECK-RV64-NEXT:    li a3, 203
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 51
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_725
 ; CHECK-RV64-NEXT:    j .LBB61_211
 ; CHECK-RV64-NEXT:  .LBB61_725: # %cond.load813
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 205
 ; CHECK-RV64-NEXT:    li a3, 204
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 50
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_726
 ; CHECK-RV64-NEXT:    j .LBB61_212
 ; CHECK-RV64-NEXT:  .LBB61_726: # %cond.load817
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 206
 ; CHECK-RV64-NEXT:    li a3, 205
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 49
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_727
 ; CHECK-RV64-NEXT:    j .LBB61_213
 ; CHECK-RV64-NEXT:  .LBB61_727: # %cond.load821
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 207
 ; CHECK-RV64-NEXT:    li a3, 206
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 48
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_728
 ; CHECK-RV64-NEXT:    j .LBB61_214
 ; CHECK-RV64-NEXT:  .LBB61_728: # %cond.load825
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 208
 ; CHECK-RV64-NEXT:    li a3, 207
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 47
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_729
 ; CHECK-RV64-NEXT:    j .LBB61_215
 ; CHECK-RV64-NEXT:  .LBB61_729: # %cond.load829
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 209
 ; CHECK-RV64-NEXT:    li a3, 208
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 46
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_730
 ; CHECK-RV64-NEXT:    j .LBB61_216
 ; CHECK-RV64-NEXT:  .LBB61_730: # %cond.load833
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 210
 ; CHECK-RV64-NEXT:    li a3, 209
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 45
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_731
 ; CHECK-RV64-NEXT:    j .LBB61_217
 ; CHECK-RV64-NEXT:  .LBB61_731: # %cond.load837
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 211
 ; CHECK-RV64-NEXT:    li a3, 210
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 44
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_732
 ; CHECK-RV64-NEXT:    j .LBB61_218
 ; CHECK-RV64-NEXT:  .LBB61_732: # %cond.load841
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 212
 ; CHECK-RV64-NEXT:    li a3, 211
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 43
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_733
 ; CHECK-RV64-NEXT:    j .LBB61_219
 ; CHECK-RV64-NEXT:  .LBB61_733: # %cond.load845
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 213
 ; CHECK-RV64-NEXT:    li a3, 212
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 42
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_734
 ; CHECK-RV64-NEXT:    j .LBB61_220
 ; CHECK-RV64-NEXT:  .LBB61_734: # %cond.load849
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 214
 ; CHECK-RV64-NEXT:    li a3, 213
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 41
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_735
 ; CHECK-RV64-NEXT:    j .LBB61_221
 ; CHECK-RV64-NEXT:  .LBB61_735: # %cond.load853
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 215
 ; CHECK-RV64-NEXT:    li a3, 214
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 40
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_736
 ; CHECK-RV64-NEXT:    j .LBB61_222
 ; CHECK-RV64-NEXT:  .LBB61_736: # %cond.load857
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 216
 ; CHECK-RV64-NEXT:    li a3, 215
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 39
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_737
 ; CHECK-RV64-NEXT:    j .LBB61_223
 ; CHECK-RV64-NEXT:  .LBB61_737: # %cond.load861
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 217
 ; CHECK-RV64-NEXT:    li a3, 216
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 38
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_738
 ; CHECK-RV64-NEXT:    j .LBB61_224
 ; CHECK-RV64-NEXT:  .LBB61_738: # %cond.load865
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 218
 ; CHECK-RV64-NEXT:    li a3, 217
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 37
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_739
 ; CHECK-RV64-NEXT:    j .LBB61_225
 ; CHECK-RV64-NEXT:  .LBB61_739: # %cond.load869
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 219
 ; CHECK-RV64-NEXT:    li a3, 218
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 36
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_740
 ; CHECK-RV64-NEXT:    j .LBB61_226
 ; CHECK-RV64-NEXT:  .LBB61_740: # %cond.load873
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 220
 ; CHECK-RV64-NEXT:    li a3, 219
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 35
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_741
 ; CHECK-RV64-NEXT:    j .LBB61_227
 ; CHECK-RV64-NEXT:  .LBB61_741: # %cond.load877
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 221
 ; CHECK-RV64-NEXT:    li a3, 220
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 34
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_742
 ; CHECK-RV64-NEXT:    j .LBB61_228
 ; CHECK-RV64-NEXT:  .LBB61_742: # %cond.load881
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 222
 ; CHECK-RV64-NEXT:    li a3, 221
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 33
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_743
 ; CHECK-RV64-NEXT:    j .LBB61_229
 ; CHECK-RV64-NEXT:  .LBB61_743: # %cond.load885
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 223
 ; CHECK-RV64-NEXT:    li a3, 222
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 32
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_744
 ; CHECK-RV64-NEXT:    j .LBB61_230
 ; CHECK-RV64-NEXT:  .LBB61_744: # %cond.load889
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 224
 ; CHECK-RV64-NEXT:    li a3, 223
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 31
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_745
 ; CHECK-RV64-NEXT:    j .LBB61_231
 ; CHECK-RV64-NEXT:  .LBB61_745: # %cond.load893
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 225
 ; CHECK-RV64-NEXT:    li a3, 224
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 30
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_746
 ; CHECK-RV64-NEXT:    j .LBB61_232
 ; CHECK-RV64-NEXT:  .LBB61_746: # %cond.load897
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 226
 ; CHECK-RV64-NEXT:    li a3, 225
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 29
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_747
 ; CHECK-RV64-NEXT:    j .LBB61_233
 ; CHECK-RV64-NEXT:  .LBB61_747: # %cond.load901
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 227
 ; CHECK-RV64-NEXT:    li a3, 226
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 28
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_748
 ; CHECK-RV64-NEXT:    j .LBB61_234
 ; CHECK-RV64-NEXT:  .LBB61_748: # %cond.load905
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 228
 ; CHECK-RV64-NEXT:    li a3, 227
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 27
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_749
 ; CHECK-RV64-NEXT:    j .LBB61_235
 ; CHECK-RV64-NEXT:  .LBB61_749: # %cond.load909
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 229
 ; CHECK-RV64-NEXT:    li a3, 228
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 26
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_750
 ; CHECK-RV64-NEXT:    j .LBB61_236
 ; CHECK-RV64-NEXT:  .LBB61_750: # %cond.load913
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 230
 ; CHECK-RV64-NEXT:    li a3, 229
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 25
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_751
 ; CHECK-RV64-NEXT:    j .LBB61_237
 ; CHECK-RV64-NEXT:  .LBB61_751: # %cond.load917
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 231
 ; CHECK-RV64-NEXT:    li a3, 230
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 24
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_752
 ; CHECK-RV64-NEXT:    j .LBB61_238
 ; CHECK-RV64-NEXT:  .LBB61_752: # %cond.load921
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 232
 ; CHECK-RV64-NEXT:    li a3, 231
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 23
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_753
 ; CHECK-RV64-NEXT:    j .LBB61_239
 ; CHECK-RV64-NEXT:  .LBB61_753: # %cond.load925
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 233
 ; CHECK-RV64-NEXT:    li a3, 232
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 22
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_754
 ; CHECK-RV64-NEXT:    j .LBB61_240
 ; CHECK-RV64-NEXT:  .LBB61_754: # %cond.load929
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 234
 ; CHECK-RV64-NEXT:    li a3, 233
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 21
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_755
 ; CHECK-RV64-NEXT:    j .LBB61_241
 ; CHECK-RV64-NEXT:  .LBB61_755: # %cond.load933
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 235
 ; CHECK-RV64-NEXT:    li a3, 234
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 20
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_756
 ; CHECK-RV64-NEXT:    j .LBB61_242
 ; CHECK-RV64-NEXT:  .LBB61_756: # %cond.load937
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 236
 ; CHECK-RV64-NEXT:    li a3, 235
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 19
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_757
 ; CHECK-RV64-NEXT:    j .LBB61_243
 ; CHECK-RV64-NEXT:  .LBB61_757: # %cond.load941
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 237
 ; CHECK-RV64-NEXT:    li a3, 236
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 18
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_758
 ; CHECK-RV64-NEXT:    j .LBB61_244
 ; CHECK-RV64-NEXT:  .LBB61_758: # %cond.load945
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 238
 ; CHECK-RV64-NEXT:    li a3, 237
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 17
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_759
 ; CHECK-RV64-NEXT:    j .LBB61_245
 ; CHECK-RV64-NEXT:  .LBB61_759: # %cond.load949
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 239
 ; CHECK-RV64-NEXT:    li a3, 238
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 16
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_760
 ; CHECK-RV64-NEXT:    j .LBB61_246
 ; CHECK-RV64-NEXT:  .LBB61_760: # %cond.load953
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 240
 ; CHECK-RV64-NEXT:    li a3, 239
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 15
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_761
 ; CHECK-RV64-NEXT:    j .LBB61_247
 ; CHECK-RV64-NEXT:  .LBB61_761: # %cond.load957
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 241
 ; CHECK-RV64-NEXT:    li a3, 240
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 14
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_762
 ; CHECK-RV64-NEXT:    j .LBB61_248
 ; CHECK-RV64-NEXT:  .LBB61_762: # %cond.load961
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 242
 ; CHECK-RV64-NEXT:    li a3, 241
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 13
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_763
 ; CHECK-RV64-NEXT:    j .LBB61_249
 ; CHECK-RV64-NEXT:  .LBB61_763: # %cond.load965
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 243
 ; CHECK-RV64-NEXT:    li a3, 242
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 12
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_764
 ; CHECK-RV64-NEXT:    j .LBB61_250
 ; CHECK-RV64-NEXT:  .LBB61_764: # %cond.load969
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 244
 ; CHECK-RV64-NEXT:    li a3, 243
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 11
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_765
 ; CHECK-RV64-NEXT:    j .LBB61_251
 ; CHECK-RV64-NEXT:  .LBB61_765: # %cond.load973
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 245
 ; CHECK-RV64-NEXT:    li a3, 244
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 10
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_766
 ; CHECK-RV64-NEXT:    j .LBB61_252
 ; CHECK-RV64-NEXT:  .LBB61_766: # %cond.load977
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 246
 ; CHECK-RV64-NEXT:    li a3, 245
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 9
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_767
 ; CHECK-RV64-NEXT:    j .LBB61_253
 ; CHECK-RV64-NEXT:  .LBB61_767: # %cond.load981
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 247
 ; CHECK-RV64-NEXT:    li a3, 246
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 8
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_768
 ; CHECK-RV64-NEXT:    j .LBB61_254
 ; CHECK-RV64-NEXT:  .LBB61_768: # %cond.load985
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 248
 ; CHECK-RV64-NEXT:    li a3, 247
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 7
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_769
 ; CHECK-RV64-NEXT:    j .LBB61_255
 ; CHECK-RV64-NEXT:  .LBB61_769: # %cond.load989
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 249
 ; CHECK-RV64-NEXT:    li a3, 248
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 6
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_770
 ; CHECK-RV64-NEXT:    j .LBB61_256
 ; CHECK-RV64-NEXT:  .LBB61_770: # %cond.load993
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 250
 ; CHECK-RV64-NEXT:    li a3, 249
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 5
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_771
 ; CHECK-RV64-NEXT:    j .LBB61_257
 ; CHECK-RV64-NEXT:  .LBB61_771: # %cond.load997
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 251
 ; CHECK-RV64-NEXT:    li a3, 250
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 4
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_772
 ; CHECK-RV64-NEXT:    j .LBB61_258
 ; CHECK-RV64-NEXT:  .LBB61_772: # %cond.load1001
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 252
 ; CHECK-RV64-NEXT:    li a3, 251
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 3
 ; CHECK-RV64-NEXT:    bltz a2, .LBB61_773
 ; CHECK-RV64-NEXT:    j .LBB61_259
 ; CHECK-RV64-NEXT:  .LBB61_773: # %cond.load1005
 ; CHECK-RV64-NEXT:    lbu a2, 0(a0)
 ; CHECK-RV64-NEXT:    li a3, 512
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.s.x v16, a2
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a2
 ; CHECK-RV64-NEXT:    li a2, 253
 ; CHECK-RV64-NEXT:    li a3, 252
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    slli a2, a1, 2
 ; CHECK-RV64-NEXT:    bgez a2, .LBB61_1028
 ; CHECK-RV64-NEXT:    j .LBB61_260
@@ -16925,15 +16778,15 @@ define <512 x i8> @test_expandload_v512i8_vlen512(ptr %base, <512 x i1> %mask, <
 ; CHECK-RV64-NEXT:    j .LBB61_261
 ; CHECK-RV64-NEXT:  .LBB61_774: # %cond.load1017
 ; CHECK-RV64-NEXT:    lbu a1, 0(a0)
-; CHECK-RV64-NEXT:    vmv.s.x v16, a1
-; CHECK-RV64-NEXT:    vmv8r.v v24, v8
+; CHECK-RV64-NEXT:    vmv8r.v v16, v8
+; CHECK-RV64-NEXT:    vmv.s.x v12, a1
 ; CHECK-RV64-NEXT:    li a1, 256
 ; CHECK-RV64-NEXT:    li a3, 255
 ; CHECK-RV64-NEXT:    vsetvli zero, a1, e8, m4, tu, ma
-; CHECK-RV64-NEXT:    vslideup.vx v8, v16, a3
+; CHECK-RV64-NEXT:    vslideup.vx v8, v12, a3
 ; CHECK-RV64-NEXT:    addi a0, a0, 1
-; CHECK-RV64-NEXT:    vmv4r.v v24, v8
-; CHECK-RV64-NEXT:    vmv8r.v v8, v24
+; CHECK-RV64-NEXT:    vmv4r.v v16, v8
+; CHECK-RV64-NEXT:    vmv8r.v v8, v16
 ; CHECK-RV64-NEXT:    andi a1, a2, 1
 ; CHECK-RV64-NEXT:    bnez a1, .LBB61_775
 ; CHECK-RV64-NEXT:    j .LBB61_265

diff  --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index 8e9751502460ec..869478a1efa78d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -396,8 +396,8 @@ define <vscale x 2 x i1> @extract_nxv64i1_nxv2i1_2(<vscale x 64 x i1> %mask) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
@@ -421,8 +421,8 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv32i1_4(<vscale x 32 x i1> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
index 65f22370d729a0..d60ce408278da3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-fp.ll
@@ -1216,8 +1216,8 @@ define float @extractelt_fadd_nxv4f32_splat(<vscale x 4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 263168
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fadd.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
@@ -1231,8 +1231,8 @@ define float @extractelt_fsub_nxv4f32_splat(<vscale x 4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 263168
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fsub.s fa0, fa4, fa5
 ; CHECK-NEXT:    ret
@@ -1246,8 +1246,8 @@ define float @extractelt_fmul_nxv4f32_splat(<vscale x 4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 3
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 263168
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fmul.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
@@ -1296,12 +1296,12 @@ define double @extractelt_nxv16f64_neg1(<vscale x 16 x double> %v) {
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    vs8r.v v8, (a0)
 ; RV32-NEXT:    slli a2, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    vs8r.v v16, (a2)
-; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    fld fa0, -8(a0)
 ; RV32-NEXT:    addi sp, s0, -80
@@ -1329,13 +1329,13 @@ define double @extractelt_nxv16f64_neg1(<vscale x 16 x double> %v) {
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vs8r.v v8, (a0)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a1, a2, 3
-; RV64-NEXT:    add a3, a0, a1
 ; RV64-NEXT:    li a1, -1
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    slli a3, a2, 3
 ; RV64-NEXT:    srli a1, a1, 32
 ; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    add a3, a0, a3
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vs8r.v v16, (a3)
 ; RV64-NEXT:    bltu a2, a1, .LBB70_2
@@ -1393,9 +1393,9 @@ define double @extractelt_nxv16f64_idx(<vscale x 16 x double> %v, i32 zeroext %i
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    addi a2, sp, 64
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a0, a2, a0
 ; RV32-NEXT:    vs8r.v v8, (a2)
-; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    vs8r.v v16, (a1)
 ; RV32-NEXT:    fld fa0, 0(a0)
@@ -1432,9 +1432,9 @@ define double @extractelt_nxv16f64_idx(<vscale x 16 x double> %v, i32 zeroext %i
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    addi a2, sp, 64
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a0, a2, a0
 ; RV64-NEXT:    vs8r.v v8, (a2)
-; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    vs8r.v v16, (a1)
 ; RV64-NEXT:    fld fa0, 0(a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
index 14719e190a6934..796f8dde58f479 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll
@@ -139,22 +139,22 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind {
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    sub sp, sp, a3
 ; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    addi a3, sp, 64
+; RV32-NEXT:    vl8r.v v8, (a0)
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vl8r.v v16, (a3)
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    vl8r.v v24, (a0)
-; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
-; RV32-NEXT:    vmseq.vi v8, v16, 0
-; RV32-NEXT:    vmseq.vi v0, v24, 0
+; RV32-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV32-NEXT:    vmseq.vi v0, v8, 0
 ; RV32-NEXT:    vmv.v.i v16, 0
+; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    vmseq.vi v8, v24, 0
 ; RV32-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV32-NEXT:    vs8r.v v24, (a0)
-; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    vs8r.v v24, (a3)
 ; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vmerge.vim v8, v16, 1, v0
-; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vs8r.v v8, (a2)
 ; RV32-NEXT:    lbu a0, 0(a1)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
@@ -179,22 +179,22 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind {
 ; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
 ; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    addi a3, sp, 64
+; RV64-NEXT:    vl8r.v v8, (a0)
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vl8r.v v16, (a3)
+; RV64-NEXT:    add a0, a0, a2
 ; RV64-NEXT:    vl8r.v v24, (a0)
-; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
-; RV64-NEXT:    vmseq.vi v8, v16, 0
-; RV64-NEXT:    vmseq.vi v0, v24, 0
+; RV64-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; RV64-NEXT:    vmseq.vi v0, v8, 0
 ; RV64-NEXT:    vmv.v.i v16, 0
+; RV64-NEXT:    add a1, a3, a1
+; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    vmseq.vi v8, v24, 0
 ; RV64-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV64-NEXT:    vs8r.v v24, (a0)
-; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    vs8r.v v24, (a3)
 ; RV64-NEXT:    vmv1r.v v0, v8
 ; RV64-NEXT:    vmerge.vim v8, v16, 1, v0
-; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vs8r.v v8, (a2)
 ; RV64-NEXT:    lbu a0, 0(a1)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
index e6263ec9f00047..1474c73dacfc89 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv32.ll
@@ -613,8 +613,8 @@ define i64 @extractelt_nxv1i64_idx(<vscale x 1 x i64> %v, i32 %idx) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    ret
@@ -654,8 +654,8 @@ define i64 @extractelt_nxv2i64_idx(<vscale x 2 x i64> %v, i32 %idx) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    ret
@@ -695,8 +695,8 @@ define i64 @extractelt_nxv4i64_idx(<vscale x 4 x i64> %v, i32 %idx) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    ret
@@ -736,8 +736,8 @@ define i64 @extractelt_nxv8i64_idx(<vscale x 8 x i64> %v, i32 %idx) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    vmv.x.s a1, v8
 ; CHECK-NEXT:    ret
@@ -876,12 +876,12 @@ define i32 @extractelt_nxv32i32_neg1(<vscale x 32 x i32> %v) {
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    addi a0, sp, 64
-; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    slli a2, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    vs8r.v v16, (a2)
-; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    lw a0, -4(a0)
 ; CHECK-NEXT:    addi sp, s0, -80
@@ -932,9 +932,9 @@ define i32 @extractelt_nxv32i32_idx(<vscale x 32 x i32> %v, i32 %idx) {
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    addi a2, sp, 64
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a0, a2, a0
 ; CHECK-NEXT:    vs8r.v v8, (a2)
-; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    vs8r.v v16, (a1)
 ; CHECK-NEXT:    lw a0, 0(a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
index d5c2b9e4842068..a9e129ef11a2cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-int-rv64.ll
@@ -862,13 +862,13 @@ define i64 @extractelt_nxv16i64_neg1(<vscale x 16 x i64> %v) {
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    addi a0, sp, 64
-; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a1, a2, 3
-; CHECK-NEXT:    add a3, a0, a1
 ; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    slli a3, a2, 3
 ; CHECK-NEXT:    srli a1, a1, 32
 ; CHECK-NEXT:    slli a2, a2, 1
+; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vs8r.v v16, (a3)
 ; CHECK-NEXT:    bltu a2, a1, .LBB74_2
@@ -926,9 +926,9 @@ define i64 @extractelt_nxv16i64_idx(<vscale x 16 x i64> %v, i32 zeroext %idx) {
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    addi a2, sp, 64
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a0, a2, a0
 ; CHECK-NEXT:    vs8r.v v8, (a2)
-; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    vs8r.v v16, (a1)
 ; CHECK-NEXT:    ld a0, 0(a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
index 4e549a5aa7c3a5..1626b362fed157 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll
@@ -147,10 +147,10 @@ define <vscale x 1 x float> @ceil_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -170,10 +170,10 @@ define <vscale x 2 x float> @ceil_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -193,10 +193,10 @@ define <vscale x 4 x float> @ceil_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -216,10 +216,10 @@ define <vscale x 8 x float> @ceil_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -239,10 +239,10 @@ define <vscale x 16 x float> @ceil_nxv16f32(<vscale x 16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
index 35936574e8fe21..4aca2d694dfbbf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll
@@ -17,9 +17,9 @@ define <vscale x 1 x bfloat> @ceil_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -40,9 +40,9 @@ define <vscale x 2 x bfloat> @ceil_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -63,9 +63,9 @@ define <vscale x 4 x bfloat> @ceil_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -86,9 +86,9 @@ define <vscale x 8 x bfloat> @ceil_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -109,9 +109,9 @@ define <vscale x 16 x bfloat> @ceil_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -132,9 +132,9 @@ define <vscale x 32 x bfloat> @ceil_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
@@ -144,19 +144,21 @@ define <vscale x 32 x bfloat> @ceil_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
+; CHECK-NEXT:    vfabs.v v8, v24
+; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    ret
   %a = call <vscale x 32 x bfloat> @llvm.ceil.nxv32bf16(<vscale x 32 x bfloat> %x)
   ret <vscale x 32 x bfloat> %a
@@ -182,9 +184,9 @@ define <vscale x 1 x half> @ceil_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -221,9 +223,9 @@ define <vscale x 2 x half> @ceil_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -260,9 +262,9 @@ define <vscale x 4 x half> @ceil_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -299,9 +301,9 @@ define <vscale x 8 x half> @ceil_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -338,9 +340,9 @@ define <vscale x 16 x half> @ceil_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -377,9 +379,9 @@ define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -389,19 +391,21 @@ define <vscale x 32 x half> @ceil_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT:    vfabs.v v8, v24
+; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 32 x half> @llvm.ceil.nxv32f16(<vscale x 32 x half> %x)
   ret <vscale x 32 x half> %a

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
index f6b47743d1154c..d93f15ec440530 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll
@@ -147,10 +147,10 @@ define <vscale x 1 x float> @floor_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -170,10 +170,10 @@ define <vscale x 2 x float> @floor_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -193,10 +193,10 @@ define <vscale x 4 x float> @floor_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -216,10 +216,10 @@ define <vscale x 8 x float> @floor_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -239,10 +239,10 @@ define <vscale x 16 x float> @floor_nxv16f32(<vscale x 16 x float> %x) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
index d26b74c7c139ec..010d7786c8891c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll
@@ -17,9 +17,9 @@ define <vscale x 1 x bfloat> @floor_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -41,9 +41,9 @@ define <vscale x 2 x bfloat> @floor_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -65,9 +65,9 @@ define <vscale x 4 x bfloat> @floor_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -89,9 +89,9 @@ define <vscale x 8 x bfloat> @floor_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -113,9 +113,9 @@ define <vscale x 16 x bfloat> @floor_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -137,9 +137,9 @@ define <vscale x 32 x bfloat> @floor_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -149,19 +149,21 @@ define <vscale x 32 x bfloat> @floor_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
+; CHECK-NEXT:    vfabs.v v8, v24
+; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    ret
   %a = call <vscale x 32 x bfloat> @llvm.floor.nxv32bf16(<vscale x 32 x bfloat> %x)
   ret <vscale x 32 x bfloat> %a
@@ -188,9 +190,9 @@ define <vscale x 1 x half> @floor_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -227,9 +229,9 @@ define <vscale x 2 x half> @floor_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -266,9 +268,9 @@ define <vscale x 4 x half> @floor_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -305,9 +307,9 @@ define <vscale x 8 x half> @floor_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -344,9 +346,9 @@ define <vscale x 16 x half> @floor_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -383,9 +385,9 @@ define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -395,19 +397,21 @@ define <vscale x 32 x half> @floor_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT:    vfabs.v v8, v24
+; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 32 x half> @llvm.floor.nxv32f16(<vscale x 32 x half> %x)
   ret <vscale x 32 x half> %a

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
index a4e7bb2f31048c..ce83e2d8a62206 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll
@@ -19,30 +19,30 @@ define <512 x i8> @single_source(<512 x i8> %a) {
 ; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    li a0, 512
 ; CHECK-NEXT:    addi a1, sp, 512
+; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.x.s a2, v16
+; CHECK-NEXT:    vslidedown.vi v24, v16, 5
+; CHECK-NEXT:    li a3, 432
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    vmv.v.x v8, a2
 ; CHECK-NEXT:    lbu a0, 770(sp)
-; CHECK-NEXT:    vmv.x.s a1, v16
-; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    vslide1down.vx v8, v8, a0
-; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 5
-; CHECK-NEXT:    li a0, 432
 ; CHECK-NEXT:    li a1, 431
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
+; CHECK-NEXT:    vslide1down.vx v8, v8, a0
+; CHECK-NEXT:    lbu a0, 1012(sp)
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v24, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v16, 4
-; CHECK-NEXT:    li a0, 466
-; CHECK-NEXT:    li a1, 465
-; CHECK-NEXT:    lbu a2, 1012(sp)
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v16, a1
-; CHECK-NEXT:    vmv.s.x v16, a2
-; CHECK-NEXT:    li a0, 501
-; CHECK-NEXT:    li a1, 500
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v16, a1
+; CHECK-NEXT:    vslidedown.vi v24, v16, 4
+; CHECK-NEXT:    li a1, 466
+; CHECK-NEXT:    vmv.s.x v16, a0
+; CHECK-NEXT:    li a0, 465
+; CHECK-NEXT:    li a2, 501
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v24, a0
+; CHECK-NEXT:    li a0, 500
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    addi sp, s0, -1536
 ; CHECK-NEXT:    .cfi_def_cfa sp, 1536
 ; CHECK-NEXT:    ld ra, 1528(sp) # 8-byte Folded Reload
@@ -61,28 +61,28 @@ define <512 x i8> @range_restriction(<512 x i8> %a) {
 ; CHECK-LABEL: range_restriction:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 512
+; CHECK-NEXT:    li a1, 254
+; CHECK-NEXT:    li a2, 432
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    li a1, 254
 ; CHECK-NEXT:    vslide1down.vx v24, v16, a1
+; CHECK-NEXT:    li a1, 431
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 5
-; CHECK-NEXT:    li a1, 432
-; CHECK-NEXT:    li a2, 431
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v24, v16, a2
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 4
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v24, v16, a1
 ; CHECK-NEXT:    li a1, 466
 ; CHECK-NEXT:    li a2, 465
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v16, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v24, v16, a2
 ; CHECK-NEXT:    li a1, 44
+; CHECK-NEXT:    li a2, 501
 ; CHECK-NEXT:    vmv.s.x v16, a1
-; CHECK-NEXT:    li a1, 501
-; CHECK-NEXT:    li a2, 500
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v24, v16, a2
+; CHECK-NEXT:    li a1, 500
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v24, v16, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vrgather.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16
@@ -107,53 +107,53 @@ define <512 x i8> @two_source(<512 x i8> %a, <512 x i8> %b) {
 ; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    li a0, 512
 ; CHECK-NEXT:    addi a1, sp, 512
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vse8.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v24, 5
-; CHECK-NEXT:    vmv.x.s a1, v24
+; CHECK-NEXT:    vmv.x.s a2, v24
+; CHECK-NEXT:    li a3, 432
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a1
-; CHECK-NEXT:    li a1, 432
+; CHECK-NEXT:    vmv.v.x v8, a2
 ; CHECK-NEXT:    li a2, 431
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
+; CHECK-NEXT:    vsetvli zero, a3, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v0, a2
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v24, 4
-; CHECK-NEXT:    li a1, 466
-; CHECK-NEXT:    li a2, 465
-; CHECK-NEXT:    lbu a3, 985(sp)
-; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v24, a2
-; CHECK-NEXT:    lbu a1, 1012(sp)
-; CHECK-NEXT:    vmv.s.x v24, a3
-; CHECK-NEXT:    li a2, 478
-; CHECK-NEXT:    li a3, 477
+; CHECK-NEXT:    vslidedown.vi v0, v24, 4
+; CHECK-NEXT:    li a2, 466
+; CHECK-NEXT:    li a3, 465
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vse8.v v24, (a1)
+; CHECK-NEXT:    lbu a1, 985(sp)
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v24, a3
+; CHECK-NEXT:    vslideup.vx v8, v0, a3
+; CHECK-NEXT:    li a2, 478
+; CHECK-NEXT:    lbu a3, 1012(sp)
 ; CHECK-NEXT:    vmv.s.x v24, a1
+; CHECK-NEXT:    li a1, 477
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v24, a1
 ; CHECK-NEXT:    li a1, 501
+; CHECK-NEXT:    vmv.s.x v24, a3
 ; CHECK-NEXT:    li a2, 500
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v24, a2
 ; CHECK-NEXT:    lui a1, 2761
-; CHECK-NEXT:    slli a1, a1, 25
-; CHECK-NEXT:    addi a1, a1, 501
-; CHECK-NEXT:    slli a1, a1, 13
-; CHECK-NEXT:    addi a1, a1, 512
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
+; CHECK-NEXT:    lui a2, 4
+; CHECK-NEXT:    vmv.s.x v25, a2
 ; CHECK-NEXT:    lui a2, 1047552
 ; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    slli a2, a2, 23
 ; CHECK-NEXT:    addi a2, a2, 1
 ; CHECK-NEXT:    slli a2, a2, 18
 ; CHECK-NEXT:    vslide1down.vx v0, v24, a2
-; CHECK-NEXT:    lui a2, 4
-; CHECK-NEXT:    vmv.s.x v24, a2
 ; CHECK-NEXT:    li a2, 64
+; CHECK-NEXT:    slli a1, a1, 25
+; CHECK-NEXT:    addi a1, a1, 501
+; CHECK-NEXT:    slli a1, a1, 13
+; CHECK-NEXT:    addi a1, a1, 512
 ; CHECK-NEXT:    vsetivli zero, 7, e64, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v0, v24, 6
+; CHECK-NEXT:    vslideup.vi v0, v25, 6
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v24, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, mu

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
index 2808ca3fd2621b..bfc43db2e369ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-binop-splats.ll
@@ -6,11 +6,11 @@ define <1 x i1> @v1i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v1i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -26,11 +26,11 @@ define <2 x i1> @v2i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v2i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    vmv.v.i v8, 0
@@ -50,11 +50,11 @@ define <4 x i1> @v4i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v4i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    vmv.v.i v8, 0
@@ -74,11 +74,11 @@ define <8 x i1> @v8i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v8i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    vmv.v.i v8, 0
@@ -98,11 +98,11 @@ define <16 x i1> @v16i1(i1 %x, i1 %y) {
 ; CHECK-LABEL: v16i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmsne.vi v8, v8, 0
 ; CHECK-NEXT:    vmsne.vi v9, v9, 0
 ; CHECK-NEXT:    vmxor.mm v0, v8, v9
 ; CHECK-NEXT:    vmv.v.i v8, 0
@@ -123,10 +123,10 @@ define <32 x i1> @v32i1(i1 %x, i1 %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
 ; CHECK-NEXT:    li a2, 32
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vmsne.vi v10, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vmsne.vi v11, v8, 0
 ; CHECK-NEXT:    vmxor.mm v0, v10, v11
@@ -148,10 +148,10 @@ define <64 x i1> @v64i1(i1 %x, i1 %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    andi a0, a0, 1
 ; CHECK-NEXT:    li a2, 64
+; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vmsne.vi v12, v8, 0
-; CHECK-NEXT:    andi a1, a1, 1
 ; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vmsne.vi v13, v8, 0
 ; CHECK-NEXT:    vmxor.mm v0, v12, v13

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index c42fabd78aabf7..3eb5d36b4896a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -11,18 +11,18 @@ define <2 x i8> @vp_bitreverse_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -37,20 +37,20 @@ define <2 x i8> @vp_bitreverse_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -65,18 +65,18 @@ define <4 x i8> @vp_bitreverse_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -91,20 +91,20 @@ define <4 x i8> @vp_bitreverse_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -119,18 +119,18 @@ define <8 x i8> @vp_bitreverse_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -145,20 +145,20 @@ define <8 x i8> @vp_bitreverse_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -173,18 +173,18 @@ define <16 x i8> @vp_bitreverse_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15, v0.t
+; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vsll.vi v9, v9, 4, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    li a0, 51
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -199,20 +199,20 @@ define <16 x i8> @vp_bitreverse_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v9, v8, 15
-; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4
+; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsll.vi v9, v9, 4
 ; CHECK-NEXT:    vand.vi v8, v8, 15
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    li a0, 51
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -227,25 +227,25 @@ define <2 x i16> @vp_bitreverse_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -261,26 +261,26 @@ define <2 x i16> @vp_bitreverse_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -295,25 +295,25 @@ define <4 x i16> @vp_bitreverse_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -329,26 +329,26 @@ define <4 x i16> @vp_bitreverse_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -363,25 +363,25 @@ define <8 x i16> @vp_bitreverse_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -397,26 +397,26 @@ define <8 x i16> @vp_bitreverse_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -431,25 +431,25 @@ define <16 x i16> @vp_bitreverse_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
+; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -465,26 +465,26 @@ define <16 x i16> @vp_bitreverse_v16i16_unmasked(<16 x i16> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 3
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 3
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 5
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -505,27 +505,27 @@ define <2 x i32> @vp_bitreverse_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -541,34 +541,34 @@ define <2 x i32> @vp_bitreverse_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -589,27 +589,27 @@ define <4 x i32> @vp_bitreverse_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -625,34 +625,34 @@ define <4 x i32> @vp_bitreverse_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
-; CHECK-NEXT:    vand.vx v10, v8, a0
-; CHECK-NEXT:    vsll.vi v10, v10, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 4
+; CHECK-NEXT:    vsll.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
@@ -673,27 +673,27 @@ define <8 x i32> @vp_bitreverse_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %e
 ; CHECK-NEXT:    vsrl.vi v12, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v10, v10, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -709,34 +709,34 @@ define <8 x i32> @vp_bitreverse_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl)
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
-; CHECK-NEXT:    vand.vx v12, v8, a0
-; CHECK-NEXT:    vsll.vi v12, v12, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
+; CHECK-NEXT:    vsll.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
@@ -757,27 +757,27 @@ define <16 x i32> @vp_bitreverse_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v12, v12, v16, v0.t
 ; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
@@ -793,34 +793,34 @@ define <16 x i32> @vp_bitreverse_v16i32_unmasked(<16 x i32> %va, i32 zeroext %ev
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
-; CHECK-NEXT:    vand.vx v16, v8, a0
-; CHECK-NEXT:    vsll.vi v16, v16, 8
-; CHECK-NEXT:    vsll.vi v8, v8, 24
-; CHECK-NEXT:    vor.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 4
+; CHECK-NEXT:    vsll.vi v16, v8, 24
+; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 61681
 ; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 209715
+; CHECK-NEXT:    addi a0, a0, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    lui a0, 349525
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
@@ -835,68 +835,67 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v9, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v11, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v11, v11, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v11, v0.t
+; RV32-NEXT:    vand.vx v11, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v9, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v11, v0.t
+; RV32-NEXT:    vsrl.vx v11, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a4
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
-; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v11, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v9, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v11, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -906,59 +905,59 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e
 ; RV64-LABEL: vp_bitreverse_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a7, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
 ; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    ret
@@ -972,67 +971,67 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vand.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vor.vv v11, v12, v11
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4
-; RV32-NEXT:    vsll.vi v11, v11, 24
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v9, v11
-; RV32-NEXT:    vsrl.vx v11, v8, a1
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v10, v10, v13
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vv v13, v13, v12
+; RV32-NEXT:    vor.vv v9, v13, v9
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v12, v12, 8
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v11, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v9, v9, v12
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1042,59 +1041,59 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl)
 ; RV64-LABEL: vp_bitreverse_v2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsll.vi v9, v9, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0
-; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4
-; RV64-NEXT:    vor.vv v10, v10, v11
+; RV64-NEXT:    vsrl.vi v9, v8, 24
+; RV64-NEXT:    vsrl.vi v10, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v11, v8, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vx v11, v8, a4
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsrl.vi v11, v8, 24
-; RV64-NEXT:    vand.vx v11, v11, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v11
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vor.vv v8, v9, v8
-; RV64-NEXT:    vsrl.vi v9, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v9, v9, v11
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vor.vv v8, v8, v9
+; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v9, v9, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    ret
@@ -1109,68 +1108,67 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV32-NEXT:    addi a4, sp, 8
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v10, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v12, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v14, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v14, v14, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v14, v0.t
+; RV32-NEXT:    vand.vx v14, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v14, v14, 24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v14, v0.t
-; RV32-NEXT:    vsrl.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v14, v0.t
+; RV32-NEXT:    vsrl.vx v14, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a4
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v14, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v14, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vand.vv v12, v12, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v14, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1180,59 +1178,59 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e
 ; RV64-LABEL: vp_bitreverse_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v12, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4, v0.t
+; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v14, v14, a7, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
 ; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV64-NEXT:    vsrl.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v14, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v14, v14, a3, v0.t
+; RV64-NEXT:    vsrl.vx v14, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
 ; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    ret
@@ -1246,67 +1244,67 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v14, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v10, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vand.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v10, v16, v10
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4
-; RV32-NEXT:    vsll.vi v14, v14, 24
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v14, v14, v16
-; RV32-NEXT:    vor.vv v10, v10, v14
-; RV32-NEXT:    vsrl.vx v14, v8, a1
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vsll.vx v18, v18, a4
+; RV32-NEXT:    vor.vv v12, v12, v18
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v14, v14, a5
+; RV32-NEXT:    vand.vv v18, v18, v16
+; RV32-NEXT:    vor.vv v14, v18, v14
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v16, v16, 8
+; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v10, v14, v10
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v14, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v10, v10, v16
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v8, v14
+; RV32-NEXT:    vand.vv v10, v10, v14
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1316,59 +1314,59 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl)
 ; RV64-LABEL: vp_bitreverse_v4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4
-; RV64-NEXT:    vor.vv v12, v12, v14
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vx v14, v8, a4
-; RV64-NEXT:    vand.vx v14, v14, a3
+; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsrl.vi v14, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v10, v8, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a5
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vor.vv v10, v16, v10
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v14, v14, a2
 ; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsrl.vi v14, v8, 24
-; RV64-NEXT:    vand.vx v14, v14, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v14, v8, a2
+; RV64-NEXT:    vsll.vi v14, v14, 8
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v14
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v10, v8
-; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vor.vv v8, v8, v14
+; RV64-NEXT:    vor.vv v8, v8, v10
+; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    ret
@@ -1383,70 +1381,69 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a4, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a4, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
-; RV32-NEXT:    addi a4, sp, 8
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v12, (a6), zero
+; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a3, v0.t
+; RV32-NEXT:    addi a5, a5, -256
+; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v20, v20, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
 ; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a3, v0.t
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    vsrl.vx v24, v8, a2, v0.t
+; RV32-NEXT:    lui a2, 349525
+; RV32-NEXT:    addi a4, a4, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a2, a2, 1365
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v28, a4
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v12, a3
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vor.vv v20, v8, v20, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vsrl.vi v20, v16, 4, v0.t
+; RV32-NEXT:    vand.vv v20, v20, v28, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v28, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 4, v0.t
+; RV32-NEXT:    vor.vv v16, v20, v16, v0.t
+; RV32-NEXT:    vsrl.vi v20, v16, 2, v0.t
+; RV32-NEXT:    vand.vv v20, v20, v12, v0.t
+; RV32-NEXT:    vand.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vor.vv v12, v20, v12, v0.t
+; RV32-NEXT:    vsrl.vi v16, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v12, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -1454,59 +1451,59 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e
 ; RV64-LABEL: vp_bitreverse_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a3, 255
+; RV64-NEXT:    li a2, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 209715
+; RV64-NEXT:    lui a7, 349525
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 819
+; RV64-NEXT:    addiw a7, a7, 1365
+; RV64-NEXT:    slli t0, a5, 32
+; RV64-NEXT:    add t0, a5, t0
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    add a6, a6, a5
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a5, a7, a5
+; RV64-NEXT:    li a7, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV64-NEXT:    slli a3, a3, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
-; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v20, v20, a7, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v20, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v20, v20, a3, v0.t
+; RV64-NEXT:    vsrl.vx v20, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
 ; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, t0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, t0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    ret
@@ -1520,67 +1517,67 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl)
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v20, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v12, v8, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vand.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v12, v24, v12
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4
-; RV32-NEXT:    vsll.vi v20, v20, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
-; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vor.vv v12, v12, v20
-; RV32-NEXT:    vsrl.vx v20, v8, a1
-; RV32-NEXT:    vsrl.vx v24, v8, a3
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v24, v24, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vsll.vx v28, v28, a4
+; RV32-NEXT:    vor.vv v16, v16, v28
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v20, v20, a5
+; RV32-NEXT:    vand.vv v28, v28, v24
+; RV32-NEXT:    vor.vv v20, v28, v20
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    lui a2, 209715
+; RV32-NEXT:    lui a3, 349525
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    addi a3, a3, 1365
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v12, v20, v12
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v20, a2
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v12
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v12, v12, v24
 ; RV32-NEXT:    vsll.vi v8, v8, 4
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v20
+; RV32-NEXT:    vand.vv v12, v12, v20
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1590,59 +1587,59 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl)
 ; RV64-LABEL: vp_bitreverse_v8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v16, v20
-; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vx v20, v8, a4
-; RV64-NEXT:    vand.vx v20, v20, a3
+; RV64-NEXT:    vsrl.vi v16, v8, 24
+; RV64-NEXT:    vsrl.vi v20, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v12, v8, a3
+; RV64-NEXT:    vsrl.vx v24, v8, a5
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vor.vv v12, v24, v12
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v16, v16, a1
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v20, v20, a2
 ; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vsrl.vi v20, v8, 24
-; RV64-NEXT:    vand.vx v20, v20, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v20, v8, a2
+; RV64-NEXT:    vsll.vi v20, v20, 8
+; RV64-NEXT:    vor.vv v20, v24, v20
+; RV64-NEXT:    vsll.vx v24, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v20
-; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vor.vv v8, v12, v8
-; RV64-NEXT:    vsrl.vi v12, v8, 4
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v24, v8
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vor.vv v8, v8, v20
+; RV64-NEXT:    vor.vv v8, v8, v12
+; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    ret
@@ -1662,116 +1659,117 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw zero, 20(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    vmv8r.v v24, v8
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    lui a4, 209715
+; RV32-NEXT:    lui a5, 349525
 ; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    addi a2, a3, -241
+; RV32-NEXT:    sw a2, 40(sp)
+; RV32-NEXT:    sw a2, 44(sp)
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    addi a3, a4, 819
+; RV32-NEXT:    sw a3, 32(sp)
+; RV32-NEXT:    sw a3, 36(sp)
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    addi a4, a5, 1365
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a5, a6, -256
+; RV32-NEXT:    sw a4, 24(sp)
+; RV32-NEXT:    sw a4, 28(sp)
+; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v24, a1, v0.t
+; RV32-NEXT:    vsrl.vx v8, v24, a2, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsrl.vi v8, v24, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vi v16, v16, 4, v0.t
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsll.vi v24, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -1790,66 +1788,65 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
@@ -1869,91 +1866,98 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    li a5, 56
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vx v16, v8, a5
+; RV32-NEXT:    vsrl.vx v24, v8, a5
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw zero, 20(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a1, a2, -241
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a2, a3, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    addi a3, a4, 1365
+; RV32-NEXT:    addi a4, a6, -256
+; RV32-NEXT:    vsrl.vx v0, v8, a5
+; RV32-NEXT:    sw a3, 24(sp)
+; RV32-NEXT:    sw a3, 28(sp)
+; RV32-NEXT:    vand.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a4
+; RV32-NEXT:    vsll.vx v0, v0, a5
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    addi a5, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v0, v0, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v0, v8, v24
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -1962,62 +1966,78 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_bitreverse_v15i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v0, v0, v16
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsll.vx v8, v8, a5
 ; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v0
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.bitreverse.v15i64(<15 x i64> %va, <15 x i1> splat (i1 true), i32 %evl)
   ret <15 x i64> %v
@@ -2035,116 +2055,117 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw zero, 20(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    sw a1, 40(sp)
-; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
+; RV32-NEXT:    vmv8r.v v24, v8
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    lui a3, 61681
+; RV32-NEXT:    lui a4, 209715
+; RV32-NEXT:    lui a5, 349525
 ; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw zero, 20(sp)
+; RV32-NEXT:    addi a2, a3, -241
+; RV32-NEXT:    sw a2, 40(sp)
+; RV32-NEXT:    sw a2, 44(sp)
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    addi a3, a4, 819
+; RV32-NEXT:    sw a3, 32(sp)
+; RV32-NEXT:    sw a3, 36(sp)
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    addi a4, a5, 1365
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    addi a5, a6, -256
+; RV32-NEXT:    sw a4, 24(sp)
+; RV32-NEXT:    sw a4, 28(sp)
+; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
+; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 48
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v24, a1, v0.t
+; RV32-NEXT:    vsrl.vx v8, v24, a2, v0.t
+; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 24, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsrl.vi v8, v24, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 4, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vi v16, v16, 4, v0.t
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsll.vi v24, v24, 4, v0.t
+; RV32-NEXT:    vor.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsll.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsll.vi v8, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vor.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
@@ -2163,66 +2184,65 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2242,91 +2262,98 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    li a5, 56
+; RV32-NEXT:    lui a6, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsll.vx v16, v8, a5
+; RV32-NEXT:    vsrl.vx v24, v8, a5
+; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw zero, 20(sp)
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    addi a1, a2, -241
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    sw a1, 24(sp)
-; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a2, a3, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    addi a3, a4, 1365
+; RV32-NEXT:    addi a4, a6, -256
+; RV32-NEXT:    vsrl.vx v0, v8, a5
+; RV32-NEXT:    sw a3, 24(sp)
+; RV32-NEXT:    sw a3, 28(sp)
+; RV32-NEXT:    vand.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v24, v0, v24
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a4
+; RV32-NEXT:    vsll.vx v0, v0, a5
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    addi a5, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v0, v16
-; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v0, v0, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v24, v24, 8
+; RV32-NEXT:    vor.vv v0, v8, v24
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v24, v16, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vor.vv v8, v8, v0
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vand.vv v16, v24, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsll.vi v8, v8, 2
-; RV32-NEXT:    vor.vv v8, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vsrl.vi v16, v8, 1
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    vor.vv v8, v24, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -2335,62 +2362,78 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_bitreverse_v16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v0, v0, v16
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsll.vx v8, v8, a5
 ; RV64-NEXT:    vor.vv v8, v16, v8
-; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    lui a0, 61681
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    slli a3, a0, 32
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v0
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.bitreverse.v16i64(<16 x i64> %va, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i64> %v
@@ -2422,58 +2465,58 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    lui a2, 3
+; CHECK-NEXT:    addi a3, a0, -64
+; CHECK-NEXT:    sltu a0, a0, a3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a3, a0, a3
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT:    addi a4, a1, -241
+; CHECK-NEXT:    addi a1, a2, 819
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    lui a2, 3
-; CHECK-NEXT:    addi a2, a2, 819
-; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    lui a3, 5
-; CHECK-NEXT:    addi a3, a3, 1365
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a4, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a4, a0, -64
-; CHECK-NEXT:    sltu a0, a0, a4
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a4
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vor.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -2501,49 +2544,53 @@ define <128 x i16> @vp_bitreverse_v128i16_unmasked(<128 x i16> %va, i32 zeroext
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    vor.vv v8, v8, v24
-; CHECK-NEXT:    vsrl.vi v24, v8, 4
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v24, v24, a1
-; CHECK-NEXT:    vand.vx v8, v8, a1
-; CHECK-NEXT:    vsll.vi v8, v8, 4
-; CHECK-NEXT:    vor.vv v8, v24, v8
-; CHECK-NEXT:    vsrl.vi v24, v8, 2
-; CHECK-NEXT:    lui a2, 3
-; CHECK-NEXT:    addi a2, a2, 819
-; CHECK-NEXT:    vand.vx v24, v24, a2
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vsll.vi v8, v8, 2
-; CHECK-NEXT:    vor.vv v8, v24, v8
-; CHECK-NEXT:    vsrl.vi v24, v8, 1
-; CHECK-NEXT:    lui a3, 5
-; CHECK-NEXT:    addi a3, a3, 1365
-; CHECK-NEXT:    vand.vx v24, v24, a3
-; CHECK-NEXT:    vand.vx v8, v8, a3
-; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    lui a3, 3
 ; CHECK-NEXT:    addi a4, a0, -64
 ; CHECK-NEXT:    sltu a0, a0, a4
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a4
+; CHECK-NEXT:    lui a4, 5
+; CHECK-NEXT:    vor.vv v8, v8, v24
+; CHECK-NEXT:    addi a2, a2, -241
+; CHECK-NEXT:    addi a3, a3, 819
+; CHECK-NEXT:    addi a4, a4, 1365
+; CHECK-NEXT:    vsrl.vi v24, v8, 4
+; CHECK-NEXT:    vand.vx v8, v8, a2
+; CHECK-NEXT:    vand.vx v24, v24, a2
+; CHECK-NEXT:    vsll.vi v8, v8, 4
+; CHECK-NEXT:    vor.vv v8, v24, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 8
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
 ; CHECK-NEXT:    vor.vv v16, v16, v24
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v24, v8, 2
+; CHECK-NEXT:    vand.vx v8, v8, a3
+; CHECK-NEXT:    vand.vx v24, v24, a3
+; CHECK-NEXT:    vsll.vi v8, v8, 2
+; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 4
-; CHECK-NEXT:    vand.vx v24, v24, a1
-; CHECK-NEXT:    vand.vx v16, v16, a1
+; CHECK-NEXT:    vand.vx v16, v16, a2
+; CHECK-NEXT:    vand.vx v24, v24, a2
 ; CHECK-NEXT:    vsll.vi v16, v16, 4
 ; CHECK-NEXT:    vor.vv v16, v24, v16
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vsrl.vi v24, v8, 1
+; CHECK-NEXT:    vand.vx v8, v8, a4
+; CHECK-NEXT:    vand.vx v24, v24, a4
+; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v24, v16, 2
-; CHECK-NEXT:    vand.vx v24, v24, a2
-; CHECK-NEXT:    vand.vx v16, v16, a2
+; CHECK-NEXT:    vand.vx v16, v16, a3
+; CHECK-NEXT:    vand.vx v24, v24, a3
 ; CHECK-NEXT:    vsll.vi v16, v16, 2
 ; CHECK-NEXT:    vor.vv v16, v24, v16
 ; CHECK-NEXT:    vsrl.vi v24, v16, 1
-; CHECK-NEXT:    vand.vx v24, v24, a3
-; CHECK-NEXT:    vand.vx v16, v16, a3
+; CHECK-NEXT:    vand.vx v16, v16, a4
+; CHECK-NEXT:    vand.vx v24, v24, a4
 ; CHECK-NEXT:    vadd.vv v16, v16, v16
 ; CHECK-NEXT:    vor.vv v16, v24, v16
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index 584f8520ab62fa..946ca4d1ab904a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -9,28 +9,28 @@ define void @bitreverse_v8i16(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    lui a1, 3
+; CHECK-NEXT:    addi a1, a1, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a1, 3
-; CHECK-NEXT:    addi a1, a1, 819
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    lui a1, 5
+; CHECK-NEXT:    addi a1, a1, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a1, 5
-; CHECK-NEXT:    addi a1, a1, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vse16.v v8, (a0)
@@ -56,36 +56,36 @@ define void @bitreverse_v4i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    addi a1, a1, -256
-; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
-; CHECK-NEXT:    vsll.vi v10, v10, 8
+; CHECK-NEXT:    lui a1, 61681
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vsll.vi v8, v8, 24
+; CHECK-NEXT:    vsll.vi v10, v10, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    lui a1, 209715
+; CHECK-NEXT:    addi a1, a1, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
-; CHECK-NEXT:    lui a1, 209715
-; CHECK-NEXT:    addi a1, a1, 819
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    lui a1, 349525
+; CHECK-NEXT:    addi a1, a1, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a1, 349525
-; CHECK-NEXT:    addi a1, a1, 1365
-; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -113,65 +113,65 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a2, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vlse64.v v11, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v10, v10, a4
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    addi a2, a5, -256
+; RV32-NEXT:    vlse64.v v9, (a6), zero
+; RV32-NEXT:    vsrl.vx v10, v8, a3
+; RV32-NEXT:    vsrl.vx v11, v8, a4
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsll.vx v13, v8, a3
+; RV32-NEXT:    vand.vx v11, v11, a2
+; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vx v11, v8, a2
+; RV32-NEXT:    vsll.vx v11, v11, a4
+; RV32-NEXT:    vor.vv v11, v13, v11
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vand.vv v13, v13, v9
+; RV32-NEXT:    vor.vv v12, v13, v12
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a4, a4, 1365
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a4
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a2
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v9, v8, v9
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v9, v9, 8
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v9, a3
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v9, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v11, v8
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v11, a4
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v10, v9
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v8, v11
+; RV32-NEXT:    vand.vv v9, v9, v11
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v9, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
@@ -184,58 +184,58 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) {
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vsrl.vx v9, v8, a1
 ; RV64-NEXT:    li a2, 40
-; RV64-NEXT:    vsrl.vx v10, v8, a2
 ; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v10, v10, a3
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    lui a4, 4080
-; RV64-NEXT:    vand.vx v10, v10, a4
-; RV64-NEXT:    vsrl.vi v11, v8, 8
 ; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    addiw a3, a3, -256
 ; RV64-NEXT:    slli a5, a5, 24
-; RV64-NEXT:    vand.vx v11, v11, a5
-; RV64-NEXT:    vor.vv v10, v11, v10
+; RV64-NEXT:    vsrl.vx v9, v8, a1
+; RV64-NEXT:    vsrl.vx v10, v8, a2
+; RV64-NEXT:    vsrl.vi v11, v8, 24
+; RV64-NEXT:    vsrl.vi v12, v8, 8
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vor.vv v9, v10, v9
 ; RV64-NEXT:    vand.vx v10, v8, a5
+; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vand.vx v12, v12, a5
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a4
 ; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vand.vx v11, v8, a4
-; RV64-NEXT:    vsll.vi v11, v11, 24
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsll.vx v11, v8, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a1
 ; RV64-NEXT:    vand.vx v8, v8, a3
 ; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    vor.vv v8, v11, v8
+; RV64-NEXT:    vor.vv v8, v12, v8
+; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    addiw a1, a1, -241
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vor.vv v9, v11, v9
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    addiw a1, a1, -241
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v9, a1
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v9, v9, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v9, a1
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vand.vx v9, v9, a3
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v9, v8
 ; RV64-NEXT:    vse64.v v8, (a0)
@@ -261,28 +261,28 @@ define void @bitreverse_v16i16(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    lui a1, 3
+; CHECK-NEXT:    addi a1, a1, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a1, 3
-; CHECK-NEXT:    addi a1, a1, 819
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    lui a1, 5
+; CHECK-NEXT:    addi a1, a1, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a1, 5
-; CHECK-NEXT:    addi a1, a1, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vse16.v v8, (a0)
@@ -308,36 +308,36 @@ define void @bitreverse_v8i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    addi a1, a1, -256
-; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a1
-; CHECK-NEXT:    vsll.vi v12, v12, 8
+; CHECK-NEXT:    lui a1, 61681
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vsll.vi v8, v8, 24
+; CHECK-NEXT:    vsll.vi v12, v12, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addi a1, a1, -241
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    lui a1, 209715
+; CHECK-NEXT:    addi a1, a1, 819
 ; CHECK-NEXT:    vsll.vi v8, v8, 4
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a1, 209715
-; CHECK-NEXT:    addi a1, a1, 819
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    lui a1, 349525
+; CHECK-NEXT:    addi a1, a1, 1365
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a1, 349525
-; CHECK-NEXT:    addi a1, a1, 1365
-; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -365,65 +365,65 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    lui a1, 1044480
-; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    lui a2, 1044480
+; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 16
+; RV32-NEXT:    lui a1, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    sw a2, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    vsrl.vx v12, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vlse64.v v14, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v14
-; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    vand.vx v16, v8, a3
-; RV32-NEXT:    vsll.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a4
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    addi a2, a5, -256
+; RV32-NEXT:    vlse64.v v10, (a6), zero
+; RV32-NEXT:    vsrl.vx v12, v8, a3
+; RV32-NEXT:    vsrl.vx v14, v8, a4
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vsll.vx v18, v8, a3
+; RV32-NEXT:    vand.vx v14, v14, a2
+; RV32-NEXT:    vor.vv v14, v14, v12
+; RV32-NEXT:    vand.vx v12, v8, a2
+; RV32-NEXT:    vsll.vx v12, v12, a4
+; RV32-NEXT:    vor.vv v12, v18, v12
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vand.vv v18, v18, v10
+; RV32-NEXT:    vor.vv v16, v18, v16
+; RV32-NEXT:    lui a2, 61681
+; RV32-NEXT:    lui a3, 209715
+; RV32-NEXT:    lui a4, 349525
+; RV32-NEXT:    addi a2, a2, -241
+; RV32-NEXT:    addi a3, a3, 819
+; RV32-NEXT:    addi a4, a4, 1365
+; RV32-NEXT:    vor.vv v14, v16, v14
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a2
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v10, v8, v10
+; RV32-NEXT:    vand.vx v8, v8, a1
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a3
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsll.vi v8, v8, 4
-; RV32-NEXT:    vor.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 2
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v12, a4
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vor.vv v8, v8, v14
+; RV32-NEXT:    vsrl.vi v14, v8, 4
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v14, v14, v16
+; RV32-NEXT:    vsll.vi v8, v8, 4
+; RV32-NEXT:    vor.vv v8, v14, v8
+; RV32-NEXT:    vsrl.vi v14, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v14, v10
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
@@ -434,60 +434,60 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) {
 ; RV64-LABEL: bitreverse_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    vle64.v v14, (a0)
 ; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vsrl.vx v10, v8, a1
 ; RV64-NEXT:    li a2, 40
-; RV64-NEXT:    vsrl.vx v12, v8, a2
 ; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v12, v12, a3
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vi v12, v8, 24
 ; RV64-NEXT:    lui a4, 4080
-; RV64-NEXT:    vand.vx v12, v12, a4
-; RV64-NEXT:    vsrl.vi v14, v8, 8
 ; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    addiw a3, a3, -256
 ; RV64-NEXT:    slli a5, a5, 24
-; RV64-NEXT:    vand.vx v14, v14, a5
-; RV64-NEXT:    vor.vv v12, v14, v12
+; RV64-NEXT:    vsrl.vx v8, v14, a1
+; RV64-NEXT:    vsrl.vx v10, v14, a2
+; RV64-NEXT:    vsrl.vi v12, v14, 24
+; RV64-NEXT:    vsrl.vi v16, v14, 8
+; RV64-NEXT:    vand.vx v10, v10, a3
+; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vand.vx v18, v14, a5
+; RV64-NEXT:    vand.vx v10, v12, a4
+; RV64-NEXT:    vand.vx v12, v16, a5
 ; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vand.vx v12, v8, a5
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vand.vx v14, v8, a4
-; RV64-NEXT:    vsll.vi v14, v14, 24
-; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsll.vx v14, v8, a1
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    vor.vv v8, v14, v8
-; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vsrl.vi v10, v8, 4
+; RV64-NEXT:    vand.vx v12, v14, a4
+; RV64-NEXT:    vsll.vi v16, v18, 8
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vor.vv v12, v12, v16
+; RV64-NEXT:    vsll.vx v16, v14, a1
+; RV64-NEXT:    vand.vx v14, v14, a3
+; RV64-NEXT:    vsll.vx v14, v14, a2
+; RV64-NEXT:    vor.vv v14, v16, v14
 ; RV64-NEXT:    lui a1, 61681
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 349525
 ; RV64-NEXT:    addiw a1, a1, -241
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vor.vv v10, v14, v12
+; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsll.vi v8, v8, 4
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v10, a1
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v10, a1
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vor.vv v8, v10, v8
 ; RV64-NEXT:    vse64.v v8, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index 8bfdf9b6884a27..d765e4c0b8f6a9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -136,9 +136,9 @@ define <2 x i32> @vp_bswap_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -178,9 +178,9 @@ define <4 x i32> @vp_bswap_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vi v10, v8, 24
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsll.vi v10, v10, 8
@@ -220,9 +220,9 @@ define <8 x i32> @vp_bswap_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsll.vi v12, v12, 8
@@ -262,9 +262,9 @@ define <16 x i32> @vp_bswap_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 8
 ; CHECK-NEXT:    lui a0, 16
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    addi a0, a0, -256
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vi v16, v8, 24
 ; CHECK-NEXT:    vor.vv v12, v12, v16
 ; CHECK-NEXT:    vand.vx v16, v8, a0
 ; CHECK-NEXT:    vsll.vi v16, v16, 8
@@ -284,38 +284,38 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v11, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v11, v11, 24, v0.t
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
+; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
 ; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
-; RV32-NEXT:    vor.vv v11, v11, v12, v0.t
-; RV32-NEXT:    vor.vv v9, v9, v11, v0.t
-; RV32-NEXT:    vsrl.vx v11, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v12, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a2, v0.t
-; RV32-NEXT:    vor.vv v11, v12, v11, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
+; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v11, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -324,31 +324,31 @@ define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
 ; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4, v0.t
+; RV64-NEXT:    vsll.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v11, v11, a5, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
 ; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
-; RV64-NEXT:    vsrl.vx v10, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v11, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v11, v11, a3, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v11, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
 ; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
@@ -363,39 +363,39 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsll.vx v9, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v10, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v9, v10
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v10, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v11, v8, a2
+; RV32-NEXT:    vsrl.vx v12, v8, a4
+; RV32-NEXT:    vand.vx v13, v8, a1
+; RV32-NEXT:    vand.vx v12, v12, a1
+; RV32-NEXT:    vor.vv v11, v12, v11
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v11, v8, a4
-; RV32-NEXT:    vsll.vi v11, v11, 24
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vsll.vx v13, v13, a4
+; RV32-NEXT:    vor.vv v10, v10, v13
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v9, v9, a5
+; RV32-NEXT:    vand.vv v13, v13, v12
+; RV32-NEXT:    vor.vv v9, v13, v9
+; RV32-NEXT:    vand.vv v12, v8, v12
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v12, v12, 8
-; RV32-NEXT:    vor.vv v11, v11, v12
-; RV32-NEXT:    vor.vv v9, v9, v11
-; RV32-NEXT:    vsrl.vx v11, v8, a1
-; RV32-NEXT:    vsrl.vx v12, v8, a3
-; RV32-NEXT:    vand.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v11, v12, v11
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v11
-; RV32-NEXT:    vor.vv v8, v9, v8
+; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v9, v9, v11
+; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -403,34 +403,34 @@ define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV64-NEXT:    vand.vx v9, v8, a1
-; RV64-NEXT:    vsll.vi v9, v9, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v10, v8, a0
-; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vor.vv v9, v9, v10
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v10, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v11, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v11, v11, a4
-; RV64-NEXT:    vor.vv v10, v10, v11
+; RV64-NEXT:    vsrl.vi v9, v8, 24
+; RV64-NEXT:    vsrl.vi v10, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v11, v8, a3
+; RV64-NEXT:    vsrl.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v9, v9, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vand.vx v10, v10, a2
 ; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vx v10, v8, a2
-; RV64-NEXT:    vsrl.vx v11, v8, a4
-; RV64-NEXT:    vand.vx v11, v11, a3
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsrl.vi v11, v8, 24
-; RV64-NEXT:    vand.vx v11, v11, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v10, v8, a2
+; RV64-NEXT:    vsll.vi v10, v10, 8
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v11
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v12, v8
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    vor.vv v8, v9, v8
+; RV64-NEXT:    vor.vv v9, v9, v11
+; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> %va, <2 x i1> splat (i1 true), i32 %evl)
   ret <2 x i64> %v
@@ -444,38 +444,38 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v10, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v14, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4, v0.t
-; RV32-NEXT:    vsll.vi v14, v14, 24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
+; RV32-NEXT:    vsll.vx v12, v12, a4, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vand.vx v12, v8, a5, v0.t
+; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v14, v0.t
 ; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vor.vv v14, v14, v16, v0.t
-; RV32-NEXT:    vor.vv v10, v10, v14, v0.t
-; RV32-NEXT:    vsrl.vx v14, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a2, v0.t
-; RV32-NEXT:    vor.vv v14, v16, v14, v0.t
+; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v14, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -484,31 +484,31 @@ define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
 ; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4, v0.t
+; RV64-NEXT:    vsll.vx v12, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v14, v14, a5, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
 ; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
-; RV64-NEXT:    vsrl.vx v12, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v14, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v14, v14, a3, v0.t
+; RV64-NEXT:    vsrl.vx v12, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v14, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
 ; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
@@ -523,39 +523,39 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vsrl.vi v10, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v12, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v12, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v14, v8, a2
+; RV32-NEXT:    vsrl.vx v16, v8, a4
+; RV32-NEXT:    vand.vx v18, v8, a1
+; RV32-NEXT:    vand.vx v16, v16, a1
+; RV32-NEXT:    vor.vv v14, v16, v14
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v16, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v14, v8, a4
-; RV32-NEXT:    vsll.vi v14, v14, 24
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vsll.vx v18, v18, a4
+; RV32-NEXT:    vor.vv v12, v12, v18
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v10, v10, a5
+; RV32-NEXT:    vand.vv v18, v18, v16
+; RV32-NEXT:    vor.vv v10, v18, v10
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v16, v16, 8
-; RV32-NEXT:    vor.vv v14, v14, v16
-; RV32-NEXT:    vor.vv v10, v10, v14
-; RV32-NEXT:    vsrl.vx v14, v8, a1
-; RV32-NEXT:    vsrl.vx v16, v8, a3
-; RV32-NEXT:    vand.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v14, v16, v14
-; RV32-NEXT:    vsrl.vi v16, v8, 24
-; RV32-NEXT:    vand.vx v16, v16, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vor.vv v8, v8, v14
-; RV32-NEXT:    vor.vv v8, v10, v8
+; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v10, v10, v14
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -563,34 +563,34 @@ define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV64-NEXT:    vand.vx v10, v8, a1
-; RV64-NEXT:    vsll.vi v10, v10, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v12, v8, a0
-; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vor.vv v10, v10, v12
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v12, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v14, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v14, v14, a4
-; RV64-NEXT:    vor.vv v12, v12, v14
+; RV64-NEXT:    vsrl.vi v10, v8, 24
+; RV64-NEXT:    vsrl.vi v12, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v14, v8, a3
+; RV64-NEXT:    vsrl.vx v16, v8, a5
+; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vand.vx v16, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v10, v10, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vand.vx v12, v12, a2
 ; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vx v12, v8, a2
-; RV64-NEXT:    vsrl.vx v14, v8, a4
-; RV64-NEXT:    vand.vx v14, v14, a3
-; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsrl.vi v14, v8, 24
-; RV64-NEXT:    vand.vx v14, v14, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v12, v8, a2
+; RV64-NEXT:    vsll.vi v12, v12, 8
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vsll.vx v16, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v14
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v16, v8
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    vor.vv v8, v10, v8
+; RV64-NEXT:    vor.vv v10, v10, v14
+; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> %va, <4 x i1> splat (i1 true), i32 %evl)
   ret <4 x i64> %v
@@ -604,34 +604,34 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3, v0.t
-; RV32-NEXT:    vor.vv v16, v12, v16, v0.t
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v12, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4, v0.t
+; RV32-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
+; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
 ; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
 ; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
-; RV32-NEXT:    vsrl.vx v20, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
@@ -644,31 +644,31 @@ define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
 ; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3, v0.t
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4, v0.t
+; RV64-NEXT:    vsll.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v20, v20, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
 ; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v20, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v20, v20, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v20, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
 ; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
@@ -683,39 +683,39 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsrl.vi v12, v8, 24
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v16, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v16, v16, a3
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v16, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v20, v8, a2
+; RV32-NEXT:    vsrl.vx v24, v8, a4
+; RV32-NEXT:    vand.vx v28, v8, a1
+; RV32-NEXT:    vand.vx v24, v24, a1
+; RV32-NEXT:    vor.vv v20, v24, v20
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v24, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vx v20, v8, a4
-; RV32-NEXT:    vsll.vi v20, v20, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsll.vx v28, v28, a4
+; RV32-NEXT:    vor.vv v16, v16, v28
+; RV32-NEXT:    vsrl.vi v28, v8, 8
+; RV32-NEXT:    vand.vx v12, v12, a5
+; RV32-NEXT:    vand.vv v28, v28, v24
+; RV32-NEXT:    vor.vv v12, v28, v12
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v20, v20, v24
-; RV32-NEXT:    vor.vv v12, v12, v20
-; RV32-NEXT:    vsrl.vx v20, v8, a1
-; RV32-NEXT:    vsrl.vx v24, v8, a3
-; RV32-NEXT:    vand.vx v24, v24, a2
-; RV32-NEXT:    vor.vv v20, v24, v20
-; RV32-NEXT:    vsrl.vi v24, v8, 24
-; RV32-NEXT:    vand.vx v24, v24, a4
-; RV32-NEXT:    vsrl.vi v8, v8, 8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vor.vv v8, v8, v20
-; RV32-NEXT:    vor.vv v8, v12, v8
+; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v12, v12, v20
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
@@ -723,34 +723,34 @@ define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_bswap_v8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV64-NEXT:    vand.vx v12, v8, a1
-; RV64-NEXT:    vsll.vi v12, v12, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v16, v8, a0
-; RV64-NEXT:    vsll.vi v16, v16, 8
-; RV64-NEXT:    vor.vv v12, v12, v16
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v16, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v20, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v20, v20, a4
-; RV64-NEXT:    vor.vv v16, v16, v20
+; RV64-NEXT:    vsrl.vi v12, v8, 24
+; RV64-NEXT:    vsrl.vi v16, v8, 8
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v20, v8, a3
+; RV64-NEXT:    vsrl.vx v24, v8, a5
+; RV64-NEXT:    vand.vx v24, v24, a0
+; RV64-NEXT:    vor.vv v20, v24, v20
+; RV64-NEXT:    vand.vx v24, v8, a1
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v12, v12, a1
+; RV64-NEXT:    vsll.vi v24, v24, 24
+; RV64-NEXT:    vand.vx v16, v16, a2
 ; RV64-NEXT:    vor.vv v12, v16, v12
-; RV64-NEXT:    vsrl.vx v16, v8, a2
-; RV64-NEXT:    vsrl.vx v20, v8, a4
-; RV64-NEXT:    vand.vx v20, v20, a3
-; RV64-NEXT:    vor.vv v16, v20, v16
-; RV64-NEXT:    vsrl.vi v20, v8, 24
-; RV64-NEXT:    vand.vx v20, v20, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vsll.vx v24, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v20
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v24, v8
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    vor.vv v8, v12, v8
+; RV64-NEXT:    vor.vv v12, v12, v20
+; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> %va, <8 x i1> splat (i1 true), i32 %evl)
   ret <8 x i64> %v
@@ -769,33 +769,33 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
@@ -814,14 +814,14 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -857,36 +857,35 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
@@ -909,51 +908,59 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v24, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    vsll.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v16, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v0, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -962,35 +969,51 @@ define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_bswap_v15i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    vsll.vx v0, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v0, v8
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64> %va, <15 x i1> splat (i1 true), i32 %evl)
   ret <15 x i64> %v
@@ -1009,33 +1032,33 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    addi a5, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1, v0.t
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2, v0.t
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v16, (a5), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
@@ -1054,14 +1077,14 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vsrl.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
-; RV32-NEXT:    vand.vx v24, v24, a4, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
@@ -1097,36 +1120,35 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    addiw a0, a4, -256
 ; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2, v0.t
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV64-NEXT:    vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
-; RV64-NEXT:    addi a5, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
-; RV64-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; RV64-NEXT:    vsrl.vx v24, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
 ; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
@@ -1149,51 +1171,59 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    lui a3, 16
+; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsll.vx v16, v8, a1
-; RV32-NEXT:    lui a2, 16
-; RV32-NEXT:    addi a2, a2, -256
-; RV32-NEXT:    vand.vx v24, v8, a2
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    vsll.vx v24, v24, a3
-; RV32-NEXT:    vor.vv v16, v16, v24
-; RV32-NEXT:    addi a4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a4, sp, 8
+; RV32-NEXT:    vsll.vx v24, v8, a2
+; RV32-NEXT:    addi a1, a3, -256
+; RV32-NEXT:    vsrl.vx v16, v8, a2
+; RV32-NEXT:    vsrl.vx v0, v8, a4
+; RV32-NEXT:    vand.vx v0, v0, a1
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vx v0, v8, a1
+; RV32-NEXT:    vsll.vx v0, v0, a4
+; RV32-NEXT:    vor.vv v16, v24, v0
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a4), zero
-; RV32-NEXT:    lui a4, 4080
+; RV32-NEXT:    vlse64.v v0, (a6), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v0, v8, a4
-; RV32-NEXT:    vsll.vi v0, v0, 24
-; RV32-NEXT:    vand.vv v24, v8, v16
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vsrl.vi v24, v8, 8
+; RV32-NEXT:    vand.vv v24, v24, v0
+; RV32-NEXT:    vor.vv v16, v24, v16
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
 ; RV32-NEXT:    vsll.vi v24, v24, 8
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v0, v8, a3
-; RV32-NEXT:    vand.vx v0, v0, a2
-; RV32-NEXT:    vsrl.vx v24, v8, a1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v8, 8
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsrl.vi v8, v8, 24
-; RV32-NEXT:    vand.vx v8, v8, a4
-; RV32-NEXT:    vor.vv v8, v16, v8
 ; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v8, v16, v8
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v8, v24, v8
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v16, v24
+; RV32-NEXT:    vor.vv v8, v8, v16
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1202,35 +1232,51 @@ define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_bswap_v16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    lui a1, 4080
+; RV64-NEXT:    li a2, 255
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    lui a4, 16
+; RV64-NEXT:    li a5, 40
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vand.vx v16, v8, a1
-; RV64-NEXT:    vsll.vi v16, v16, 24
-; RV64-NEXT:    li a0, 255
-; RV64-NEXT:    slli a0, a0, 24
-; RV64-NEXT:    vand.vx v24, v8, a0
-; RV64-NEXT:    vsll.vi v24, v24, 8
-; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    li a2, 56
-; RV64-NEXT:    vsll.vx v24, v8, a2
-; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v0, v8, a3
-; RV64-NEXT:    li a4, 40
-; RV64-NEXT:    vsll.vx v0, v0, a4
-; RV64-NEXT:    vor.vv v24, v24, v0
-; RV64-NEXT:    vor.vv v16, v24, v16
-; RV64-NEXT:    vsrl.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vx v0, v8, a4
-; RV64-NEXT:    vand.vx v0, v0, a3
+; RV64-NEXT:    vsrl.vi v24, v8, 24
+; RV64-NEXT:    addiw a0, a4, -256
+; RV64-NEXT:    vsrl.vx v16, v8, a3
+; RV64-NEXT:    vsrl.vx v0, v8, a5
+; RV64-NEXT:    vand.vx v0, v0, a0
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vsrl.vi v0, v8, 8
+; RV64-NEXT:    slli a2, a2, 24
+; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v0, v0, a2
 ; RV64-NEXT:    vor.vv v24, v0, v24
-; RV64-NEXT:    vsrl.vi v0, v8, 24
-; RV64-NEXT:    vand.vx v0, v0, a1
-; RV64-NEXT:    vsrl.vi v8, v8, 8
+; RV64-NEXT:    vand.vx v0, v8, a1
+; RV64-NEXT:    vsll.vi v0, v0, 24
+; RV64-NEXT:    vand.vx v16, v8, a2
+; RV64-NEXT:    vsll.vi v16, v16, 8
+; RV64-NEXT:    vor.vv v16, v0, v16
+; RV64-NEXT:    vsll.vx v0, v8, a3
 ; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    vor.vv v8, v8, v0
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vsll.vx v8, v8, a5
+; RV64-NEXT:    vor.vv v8, v0, v8
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vor.vv v16, v24, v16
+; RV64-NEXT:    vor.vv v8, v8, v16
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> %va, <16 x i1> splat (i1 true), i32 %evl)
   ret <16 x i64> %v

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
index 1dff8aed060541..5e491f21e62134 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll
@@ -35,15 +35,15 @@ define void @bswap_v4i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    addi a1, a1, -256
-; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    vsrl.vi v9, v8, 8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 24
+; CHECK-NEXT:    vand.vx v9, v9, a1
 ; CHECK-NEXT:    vor.vv v9, v9, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
-; CHECK-NEXT:    vsll.vi v10, v10, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 24
+; CHECK-NEXT:    vsll.vi v10, v10, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -72,36 +72,36 @@ define void @bswap_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    lui a4, 16
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v9, v8, a1
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    addi a1, a4, -256
+; RV32-NEXT:    vlse64.v v9, (a6), zero
 ; RV32-NEXT:    vsrl.vx v10, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v10, v10, a3
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsrl.vi v10, v8, 24
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vlse64.v v11, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v10, v10, a4
-; RV32-NEXT:    vsrl.vi v12, v8, 8
-; RV32-NEXT:    vand.vv v12, v12, v11
+; RV32-NEXT:    vsrl.vx v11, v8, a3
+; RV32-NEXT:    vsrl.vi v12, v8, 24
+; RV32-NEXT:    vsll.vx v13, v8, a2
+; RV32-NEXT:    vand.vx v11, v11, a1
+; RV32-NEXT:    vor.vv v10, v11, v10
+; RV32-NEXT:    vand.vx v11, v8, a1
+; RV32-NEXT:    vsll.vx v11, v11, a3
+; RV32-NEXT:    vor.vv v11, v13, v11
+; RV32-NEXT:    vsrl.vi v13, v8, 8
+; RV32-NEXT:    vand.vx v12, v12, a5
+; RV32-NEXT:    vand.vv v13, v13, v9
+; RV32-NEXT:    vor.vv v12, v13, v12
+; RV32-NEXT:    vand.vv v9, v8, v9
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v9, v9, 8
 ; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vor.vv v9, v10, v9
-; RV32-NEXT:    vsll.vx v10, v8, a1
-; RV32-NEXT:    vand.vx v12, v8, a3
-; RV32-NEXT:    vsll.vx v12, v12, a2
-; RV32-NEXT:    vor.vv v10, v10, v12
-; RV32-NEXT:    vand.vx v12, v8, a4
-; RV32-NEXT:    vsll.vi v12, v12, 24
-; RV32-NEXT:    vand.vv v8, v8, v11
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v12, v8
-; RV32-NEXT:    vor.vv v8, v10, v8
 ; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v11, v8
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -112,31 +112,31 @@ define void @bswap_v2i64(ptr %x, ptr %y) {
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vsrl.vx v9, v8, a1
 ; RV64-NEXT:    li a2, 40
-; RV64-NEXT:    vsrl.vx v10, v8, a2
 ; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v10, v10, a3
-; RV64-NEXT:    vor.vv v9, v10, v9
-; RV64-NEXT:    vsrl.vi v10, v8, 24
 ; RV64-NEXT:    lui a4, 4080
-; RV64-NEXT:    vand.vx v10, v10, a4
-; RV64-NEXT:    vsrl.vi v11, v8, 8
 ; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    addiw a3, a3, -256
 ; RV64-NEXT:    slli a5, a5, 24
-; RV64-NEXT:    vand.vx v11, v11, a5
-; RV64-NEXT:    vor.vv v10, v11, v10
+; RV64-NEXT:    vsrl.vx v9, v8, a1
+; RV64-NEXT:    vsrl.vx v10, v8, a2
+; RV64-NEXT:    vsrl.vi v11, v8, 24
+; RV64-NEXT:    vsrl.vi v12, v8, 8
+; RV64-NEXT:    vand.vx v10, v10, a3
 ; RV64-NEXT:    vor.vv v9, v10, v9
 ; RV64-NEXT:    vand.vx v10, v8, a5
+; RV64-NEXT:    vand.vx v11, v11, a4
+; RV64-NEXT:    vand.vx v12, v12, a5
+; RV64-NEXT:    vor.vv v11, v12, v11
+; RV64-NEXT:    vand.vx v12, v8, a4
 ; RV64-NEXT:    vsll.vi v10, v10, 8
-; RV64-NEXT:    vand.vx v11, v8, a4
-; RV64-NEXT:    vsll.vi v11, v11, 24
-; RV64-NEXT:    vor.vv v10, v11, v10
-; RV64-NEXT:    vsll.vx v11, v8, a1
+; RV64-NEXT:    vsll.vi v12, v12, 24
+; RV64-NEXT:    vor.vv v10, v12, v10
+; RV64-NEXT:    vsll.vx v12, v8, a1
 ; RV64-NEXT:    vand.vx v8, v8, a3
 ; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    vor.vv v8, v11, v8
+; RV64-NEXT:    vor.vv v8, v12, v8
+; RV64-NEXT:    vor.vv v9, v11, v9
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vse64.v v8, (a0)
@@ -188,15 +188,15 @@ define void @bswap_v8i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    lui a1, 16
 ; CHECK-NEXT:    addi a1, a1, -256
-; CHECK-NEXT:    vand.vx v10, v10, a1
+; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 24
+; CHECK-NEXT:    vand.vx v10, v10, a1
 ; CHECK-NEXT:    vor.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a1
-; CHECK-NEXT:    vsll.vi v12, v12, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 24
+; CHECK-NEXT:    vsll.vi v12, v12, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -225,36 +225,36 @@ define void @bswap_v4i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 1044480
+; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    lui a4, 16
+; RV32-NEXT:    lui a5, 4080
+; RV32-NEXT:    addi a6, sp, 8
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw zero, 12(sp)
-; RV32-NEXT:    li a1, 56
-; RV32-NEXT:    vsrl.vx v10, v8, a1
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    addi a1, a4, -256
+; RV32-NEXT:    vlse64.v v10, (a6), zero
 ; RV32-NEXT:    vsrl.vx v12, v8, a2
-; RV32-NEXT:    lui a3, 16
-; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    vand.vx v12, v12, a3
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsrl.vi v12, v8, 24
-; RV32-NEXT:    addi a4, sp, 8
-; RV32-NEXT:    vlse64.v v14, (a4), zero
-; RV32-NEXT:    lui a4, 4080
-; RV32-NEXT:    vand.vx v12, v12, a4
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vand.vv v16, v16, v14
+; RV32-NEXT:    vsrl.vx v14, v8, a3
+; RV32-NEXT:    vsrl.vi v16, v8, 24
+; RV32-NEXT:    vsll.vx v18, v8, a2
+; RV32-NEXT:    vand.vx v14, v14, a1
+; RV32-NEXT:    vor.vv v12, v14, v12
+; RV32-NEXT:    vand.vx v14, v8, a1
+; RV32-NEXT:    vsll.vx v14, v14, a3
+; RV32-NEXT:    vor.vv v14, v18, v14
+; RV32-NEXT:    vsrl.vi v18, v8, 8
+; RV32-NEXT:    vand.vx v16, v16, a5
+; RV32-NEXT:    vand.vv v18, v18, v10
+; RV32-NEXT:    vor.vv v16, v18, v16
+; RV32-NEXT:    vand.vv v10, v8, v10
+; RV32-NEXT:    vand.vx v8, v8, a5
+; RV32-NEXT:    vsll.vi v8, v8, 24
+; RV32-NEXT:    vsll.vi v10, v10, 8
 ; RV32-NEXT:    vor.vv v12, v16, v12
-; RV32-NEXT:    vor.vv v10, v12, v10
-; RV32-NEXT:    vsll.vx v12, v8, a1
-; RV32-NEXT:    vand.vx v16, v8, a3
-; RV32-NEXT:    vsll.vx v16, v16, a2
-; RV32-NEXT:    vor.vv v12, v12, v16
-; RV32-NEXT:    vand.vx v16, v8, a4
-; RV32-NEXT:    vsll.vi v16, v16, 24
-; RV32-NEXT:    vand.vv v8, v8, v14
-; RV32-NEXT:    vsll.vi v8, v8, 8
-; RV32-NEXT:    vor.vv v8, v16, v8
-; RV32-NEXT:    vor.vv v8, v12, v8
 ; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v14, v8
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    addi sp, sp, 16
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
@@ -265,31 +265,31 @@ define void @bswap_v4i64(ptr %x, ptr %y) {
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vsrl.vx v10, v8, a1
 ; RV64-NEXT:    li a2, 40
-; RV64-NEXT:    vsrl.vx v12, v8, a2
 ; RV64-NEXT:    lui a3, 16
-; RV64-NEXT:    addiw a3, a3, -256
-; RV64-NEXT:    vand.vx v12, v12, a3
-; RV64-NEXT:    vor.vv v10, v12, v10
-; RV64-NEXT:    vsrl.vi v12, v8, 24
 ; RV64-NEXT:    lui a4, 4080
-; RV64-NEXT:    vand.vx v12, v12, a4
-; RV64-NEXT:    vsrl.vi v14, v8, 8
 ; RV64-NEXT:    li a5, 255
+; RV64-NEXT:    addiw a3, a3, -256
 ; RV64-NEXT:    slli a5, a5, 24
-; RV64-NEXT:    vand.vx v14, v14, a5
-; RV64-NEXT:    vor.vv v12, v14, v12
+; RV64-NEXT:    vsrl.vx v10, v8, a1
+; RV64-NEXT:    vsrl.vx v12, v8, a2
+; RV64-NEXT:    vsrl.vi v14, v8, 24
+; RV64-NEXT:    vsrl.vi v16, v8, 8
+; RV64-NEXT:    vand.vx v12, v12, a3
 ; RV64-NEXT:    vor.vv v10, v12, v10
 ; RV64-NEXT:    vand.vx v12, v8, a5
+; RV64-NEXT:    vand.vx v14, v14, a4
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vor.vv v14, v16, v14
+; RV64-NEXT:    vand.vx v16, v8, a4
 ; RV64-NEXT:    vsll.vi v12, v12, 8
-; RV64-NEXT:    vand.vx v14, v8, a4
-; RV64-NEXT:    vsll.vi v14, v14, 24
-; RV64-NEXT:    vor.vv v12, v14, v12
-; RV64-NEXT:    vsll.vx v14, v8, a1
+; RV64-NEXT:    vsll.vi v16, v16, 24
+; RV64-NEXT:    vor.vv v12, v16, v12
+; RV64-NEXT:    vsll.vx v16, v8, a1
 ; RV64-NEXT:    vand.vx v8, v8, a3
 ; RV64-NEXT:    vsll.vx v8, v8, a2
-; RV64-NEXT:    vor.vv v8, v14, v8
+; RV64-NEXT:    vor.vv v8, v16, v8
+; RV64-NEXT:    vor.vv v10, v14, v10
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vse64.v v8, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
index 5d75efe681af73..dbbb8362144cab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -31,12 +31,12 @@ define <8 x i32> @add_constant_rhs_8xi32(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI1_0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a4
-; CHECK-NEXT:    lui a0, %hi(.LCPI1_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI1_0)
 ; CHECK-NEXT:    vle32.v v10, (a0)
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a5
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a6
@@ -118,22 +118,22 @@ define <4 x i32> @udiv_constant_rhs(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI4_0)
-; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    vslide1down.vx v8, v8, a2
-; CHECK-NEXT:    vslide1down.vx v8, v8, a3
-; CHECK-NEXT:    vmulhu.vv v9, v8, v9
-; CHECK-NEXT:    vsub.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    lui a0, 524288
-; CHECK-NEXT:    vslide1down.vx v11, v11, a0
+; CHECK-NEXT:    lui a1, 524288
+; CHECK-NEXT:    vle32.v v10, (a0)
 ; CHECK-NEXT:    lui a0, %hi(.LCPI4_1)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI4_1)
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vmulhu.vv v10, v10, v11
-; CHECK-NEXT:    vadd.vv v9, v10, v9
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vle32.v v11, (a0)
+; CHECK-NEXT:    vslide1down.vx v8, v8, a2
+; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vmulhu.vv v10, v8, v10
+; CHECK-NEXT:    vsub.vv v12, v8, v10
+; CHECK-NEXT:    vmulhu.vv v9, v12, v9
+; CHECK-NEXT:    vadd.vv v9, v9, v10
 ; CHECK-NEXT:    vmv.v.i v0, 4
-; CHECK-NEXT:    vsrl.vv v9, v9, v12
+; CHECK-NEXT:    vsrl.vv v9, v9, v11
 ; CHECK-NEXT:    vmerge.vvm v8, v9, v8, v0
 ; CHECK-NEXT:    ret
   %e0 = udiv i32 %a, 23
@@ -224,12 +224,12 @@ define <4 x i32> @add_constant_rhs_with_identity(i32 %a, i32 %b, i32 %c, i32 %d)
 ; RV32-NEXT:    addi a1, a1, 25
 ; RV32-NEXT:    addi a2, a2, 1
 ; RV32-NEXT:    addi a3, a3, 2047
-; RV32-NEXT:    addi a3, a3, 308
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    addi a0, a3, 308
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_constant_rhs_with_identity:
@@ -237,12 +237,12 @@ define <4 x i32> @add_constant_rhs_with_identity(i32 %a, i32 %b, i32 %c, i32 %d)
 ; RV64-NEXT:    addiw a1, a1, 25
 ; RV64-NEXT:    addiw a2, a2, 1
 ; RV64-NEXT:    addi a3, a3, 2047
-; RV64-NEXT:    addiw a3, a3, 308
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    addiw a0, a3, 308
 ; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    vslide1down.vx v8, v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a3
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %e0 = add i32 %a, 0
   %e1 = add i32 %b, 25
@@ -261,12 +261,12 @@ define <4 x i32> @add_constant_rhs_identity(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; RV32-NEXT:    addi a1, a1, 25
 ; RV32-NEXT:    addi a2, a2, 1
 ; RV32-NEXT:    addi a3, a3, 2047
-; RV32-NEXT:    addi a3, a3, 308
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    addi a0, a3, 308
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_constant_rhs_identity:
@@ -274,12 +274,12 @@ define <4 x i32> @add_constant_rhs_identity(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; RV64-NEXT:    addiw a1, a1, 25
 ; RV64-NEXT:    addiw a2, a2, 1
 ; RV64-NEXT:    addi a3, a3, 2047
-; RV64-NEXT:    addiw a3, a3, 308
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    addiw a0, a3, 308
 ; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    vslide1down.vx v8, v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a3
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %e0 = add i32 %a, 0
   %e1 = add i32 %b, 25
@@ -562,20 +562,21 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 5, e32, m2, tu, ma
 ; CHECK-NEXT:    vmv.s.x v10, a0
+; CHECK-NEXT:    vmv.s.x v12, a1
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vmv.s.x v10, a1
-; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 5
 ; CHECK-NEXT:    vmv.s.x v10, a2
-; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 6
 ; CHECK-NEXT:    lui a0, %hi(.LCPI19_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI19_0)
+; CHECK-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 5
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vmv.s.x v12, a3
-; CHECK-NEXT:    vslideup.vi v8, v12, 7
-; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vle32.v v12, (a0)
+; CHECK-NEXT:    vsetivli zero, 7, e32, m2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 6
+; CHECK-NEXT:    vmv.s.x v10, a3
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 7
+; CHECK-NEXT:    vadd.vv v8, v8, v12
 ; CHECK-NEXT:    ret
   %vadd = add <8 x i32> %vin, <i32 1, i32 2, i32 3, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
   %e0 = add i32 %a, 23
@@ -598,9 +599,9 @@ define <2 x i32> @build_vec_of_trunc_op(i64 %a, i64 %b) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    slli a1, a1, 31
 ; RV32-NEXT:    srli a0, a0, 1
-; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    slli a3, a3, 31
 ; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    or a2, a2, a3
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
index 3c090bb9003119..ee953a66a004f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll
@@ -85,16 +85,16 @@ define fastcc <128 x i32> @ret_split_v128i32(ptr %x) {
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a2, a1, 256
-; CHECK-NEXT:    vle32.v v16, (a1)
-; CHECK-NEXT:    addi a1, a1, 384
+; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    addi a2, a1, 384
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vle32.v v0, (a2)
-; CHECK-NEXT:    vse32.v v16, (a0)
 ; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vse32.v v24, (a1)
-; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vse32.v v0, (a1)
+; CHECK-NEXT:    vle32.v v0, (a2)
+; CHECK-NEXT:    addi a2, a0, 256
+; CHECK-NEXT:    vse32.v v24, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vse32.v v0, (a1)
+; CHECK-NEXT:    vse32.v v16, (a2)
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = load <128 x i32>, ptr %x
@@ -257,9 +257,7 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3
 ; CHECK-NEXT:    .cfi_def_cfa s0, 0
 ; CHECK-NEXT:    andi sp, sp, -128
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    mv a0, sp
+; CHECK-NEXT:    mv t0, sp
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    li a2, 2
 ; CHECK-NEXT:    li a3, 3
@@ -268,8 +266,10 @@ define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i3
 ; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    li a7, 7
 ; CHECK-NEXT:    mv t3, sp
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    li t4, 8
-; CHECK-NEXT:    vse32.v v8, (a0)
+; CHECK-NEXT:    vse32.v v8, (t0)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    call vector_arg_indirect_stack
@@ -306,19 +306,17 @@ define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3
 define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
 ; CHECK-LABEL: pass_vector_arg_direct_stack:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -160
-; CHECK-NEXT:    .cfi_def_cfa_offset 160
-; CHECK-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    addi sp, sp, -176
+; CHECK-NEXT:    .cfi_def_cfa_offset 176
+; CHECK-NEXT:    sd ra, 168(sp) # 8-byte Folded Spill
+; CHECK-NEXT:    sd s0, 160(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
+; CHECK-NEXT:    .cfi_offset s0, -16
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    sd a0, 144(sp)
-; CHECK-NEXT:    li a0, 13
-; CHECK-NEXT:    li t0, 12
+; CHECK-NEXT:    addi t0, sp, 16
+; CHECK-NEXT:    li t1, 1
+; CHECK-NEXT:    li t2, 13
+; CHECK-NEXT:    li s0, 12
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    li a2, 2
 ; CHECK-NEXT:    li a3, 3
@@ -327,17 +325,23 @@ define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32>
 ; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    li a7, 7
 ; CHECK-NEXT:    li t3, 8
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vse32.v v8, (t0)
 ; CHECK-NEXT:    li t4, 9
 ; CHECK-NEXT:    li t5, 10
+; CHECK-NEXT:    sd t1, 144(sp)
 ; CHECK-NEXT:    li t6, 11
-; CHECK-NEXT:    sd t0, 0(sp)
-; CHECK-NEXT:    sd a0, 8(sp)
+; CHECK-NEXT:    sd s0, 0(sp)
+; CHECK-NEXT:    sd t2, 8(sp)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    call vector_arg_direct_stack
-; CHECK-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld ra, 168(sp) # 8-byte Folded Reload
+; CHECK-NEXT:    ld s0, 160(sp) # 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_restore ra
-; CHECK-NEXT:    addi sp, sp, 160
+; CHECK-NEXT:    .cfi_restore s0
+; CHECK-NEXT:    addi sp, sp, 176
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %s = call fastcc <32 x i32> @vector_arg_direct_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 1)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
index fcdb5d5cb6aef7..73e148edbe2d67 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll
@@ -85,16 +85,16 @@ define <128 x i32> @ret_split_v128i32(ptr %x) {
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a2, a1, 256
-; CHECK-NEXT:    vle32.v v16, (a1)
-; CHECK-NEXT:    addi a1, a1, 384
+; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    addi a2, a1, 384
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vle32.v v0, (a2)
-; CHECK-NEXT:    vse32.v v16, (a0)
 ; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vse32.v v24, (a1)
-; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vse32.v v0, (a1)
+; CHECK-NEXT:    vle32.v v0, (a2)
+; CHECK-NEXT:    addi a2, a0, 256
+; CHECK-NEXT:    vse32.v v24, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vse32.v v0, (a1)
+; CHECK-NEXT:    vse32.v v16, (a2)
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = load <128 x i32>, ptr %x
@@ -312,18 +312,18 @@ define <32 x i32> @pass_vector_arg_via_stack(<32 x i32> %x, <32 x i32> %y, <32 x
 ; CHECK-NEXT:    sd ra, 136(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vse32.v v8, (sp)
-; CHECK-NEXT:    li a0, 8
+; CHECK-NEXT:    li t0, 8
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    li a2, 2
 ; CHECK-NEXT:    li a3, 3
 ; CHECK-NEXT:    li a4, 4
 ; CHECK-NEXT:    li a5, 5
 ; CHECK-NEXT:    li a6, 6
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vse32.v v8, (sp)
 ; CHECK-NEXT:    li a7, 7
-; CHECK-NEXT:    sd a0, 128(sp)
+; CHECK-NEXT:    sd t0, 128(sp)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    call vector_arg_via_stack
@@ -358,25 +358,27 @@ define <4 x i1> @pass_vector_mask_arg_via_stack(<4 x i1> %v) {
 ; CHECK-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset ra, -8
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vse32.v v8, (sp)
-; CHECK-NEXT:    li a0, 8
-; CHECK-NEXT:    sd a0, 128(sp)
+; CHECK-NEXT:    li a1, 8
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v17, 0
+; CHECK-NEXT:    addi a2, sp, 136
+; CHECK-NEXT:    li a5, 5
+; CHECK-NEXT:    li a6, 6
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    sd a1, 128(sp)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vse32.v v8, (sp)
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vmv.v.v v17, v16
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmsne.vi v16, v17, 0
-; CHECK-NEXT:    addi a0, sp, 136
-; CHECK-NEXT:    li a5, 5
-; CHECK-NEXT:    li a6, 6
 ; CHECK-NEXT:    li a7, 7
-; CHECK-NEXT:    vsm.v v16, (a0)
+; CHECK-NEXT:    vsm.v v16, (a2)
 ; CHECK-NEXT:    li a0, 0
 ; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:    li a2, 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 29f437829f3be0..511242aa677c2a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -72,10 +72,10 @@ define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -154,10 +154,10 @@ define <4 x half> @vp_ceil_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -238,10 +238,10 @@ define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.ceil.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_ceil_v16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -283,10 +283,10 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
@@ -324,10 +324,10 @@ define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.ceil.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_ceil_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.ceil.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_ceil_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -649,9 +649,9 @@ declare <15 x double> @llvm.vp.ceil.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_ceil_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v15f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.ceil.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_ceil_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v16f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.ceil.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 3
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 3
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 3
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index e2d7ed55c46016..9d0d42cf754c5e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -11,6 +11,7 @@ define <2 x i8> @vp_ctlz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -18,10 +19,9 @@ define <2 x i8> @vp_ctlz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -39,6 +39,7 @@ define <2 x i8> @vp_ctlz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -46,10 +47,9 @@ define <2 x i8> @vp_ctlz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -69,6 +69,7 @@ define <4 x i8> @vp_ctlz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -76,10 +77,9 @@ define <4 x i8> @vp_ctlz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -97,6 +97,7 @@ define <4 x i8> @vp_ctlz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -104,10 +105,9 @@ define <4 x i8> @vp_ctlz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -127,6 +127,7 @@ define <8 x i8> @vp_ctlz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -134,10 +135,9 @@ define <8 x i8> @vp_ctlz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -155,6 +155,7 @@ define <8 x i8> @vp_ctlz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -162,10 +163,9 @@ define <8 x i8> @vp_ctlz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -185,6 +185,7 @@ define <16 x i8> @vp_ctlz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -192,10 +193,9 @@ define <16 x i8> @vp_ctlz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -213,6 +213,7 @@ define <16 x i8> @vp_ctlz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -220,10 +221,9 @@ define <16 x i8> @vp_ctlz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -243,7 +243,9 @@ define <2 x i16> @vp_ctlz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -252,20 +254,18 @@ define <2 x i16> @vp_ctlz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -280,7 +280,9 @@ define <2 x i16> @vp_ctlz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -289,20 +291,18 @@ define <2 x i16> @vp_ctlz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -319,7 +319,9 @@ define <4 x i16> @vp_ctlz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -328,20 +330,18 @@ define <4 x i16> @vp_ctlz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -356,7 +356,9 @@ define <4 x i16> @vp_ctlz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -365,20 +367,18 @@ define <4 x i16> @vp_ctlz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -395,7 +395,9 @@ define <8 x i16> @vp_ctlz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -404,20 +406,18 @@ define <8 x i16> @vp_ctlz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -432,7 +432,9 @@ define <8 x i16> @vp_ctlz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -441,20 +443,18 @@ define <8 x i16> @vp_ctlz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -471,7 +471,9 @@ define <16 x i16> @vp_ctlz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
@@ -480,20 +482,18 @@ define <16 x i16> @vp_ctlz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -508,7 +508,9 @@ define <16 x i16> @vp_ctlz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
@@ -517,20 +519,18 @@ define <16 x i16> @vp_ctlz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -547,7 +547,9 @@ define <2 x i32> @vp_ctlz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -558,20 +560,18 @@ define <2 x i32> @vp_ctlz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -587,7 +587,9 @@ define <2 x i32> @vp_ctlz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -598,20 +600,18 @@ define <2 x i32> @vp_ctlz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -629,7 +629,9 @@ define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -640,20 +642,18 @@ define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -669,7 +669,9 @@ define <4 x i32> @vp_ctlz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -680,20 +682,18 @@ define <4 x i32> @vp_ctlz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -711,7 +711,9 @@ define <8 x i32> @vp_ctlz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
@@ -722,20 +724,18 @@ define <8 x i32> @vp_ctlz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -751,7 +751,9 @@ define <8 x i32> @vp_ctlz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
@@ -762,20 +764,18 @@ define <8 x i32> @vp_ctlz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -793,7 +793,9 @@ define <16 x i32> @vp_ctlz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
@@ -804,20 +806,18 @@ define <16 x i32> @vp_ctlz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -833,7 +833,9 @@ define <16 x i32> @vp_ctlz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
@@ -844,20 +846,18 @@ define <16 x i32> @vp_ctlz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -875,6 +875,12 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -884,49 +890,60 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1, v0.t
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -936,38 +953,21 @@ define <2 x i64> @vp_ctlz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v9, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl)
@@ -979,6 +979,12 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -988,40 +994,34 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1031,6 +1031,23 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -1040,37 +1057,20 @@ define <2 x i64> @vp_ctlz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0
+; RV64-NEXT:    vsrl.vx v9, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1084,59 +1084,76 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v4i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
@@ -1146,38 +1163,21 @@ define <4 x i64> @vp_ctlz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl)
@@ -1189,6 +1189,12 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v10
@@ -1198,40 +1204,34 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1241,6 +1241,23 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v10
@@ -1250,37 +1267,20 @@ define <4 x i64> @vp_ctlz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    vsrl.vx v10, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1294,59 +1294,76 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vp_ctlz_v8i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -1356,38 +1373,21 @@ define <8 x i64> @vp_ctlz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v12, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl)
@@ -1399,6 +1399,12 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v12
@@ -1408,40 +1414,34 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1451,6 +1451,23 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v12
@@ -1460,37 +1477,20 @@ define <8 x i64> @vp_ctlz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0
+; RV64-NEXT:    vsrl.vx v12, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1522,11 +1522,21 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1536,58 +1546,52 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1601,6 +1605,23 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1610,38 +1631,21 @@ define <15 x i64> @vp_ctlz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
@@ -1666,46 +1670,48 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vx v0, v8, a1
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v0, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1717,6 +1723,23 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -1726,37 +1749,20 @@ define <15 x i64> @vp_ctlz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v16, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1788,11 +1794,21 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1802,58 +1818,52 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1867,6 +1877,23 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1876,38 +1903,21 @@ define <16 x i64> @vp_ctlz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
@@ -1932,46 +1942,48 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vx v0, v8, a1
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v0, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1983,6 +1995,23 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -1992,37 +2021,20 @@ define <16 x i64> @vp_ctlz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v16, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -2050,29 +2062,32 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB34_2
+; RV32-NEXT:    bltu a0, a1, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB34_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    addi a4, sp, 32
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2082,37 +2097,31 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    mul a3, a3, a5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
@@ -2373,6 +2382,28 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:  .LBB34_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a6, a4, -241
+; RV64-NEXT:    addiw a7, a5, 257
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a5, a2, a5
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a4, a3, a4
+; RV64-NEXT:    slli a2, a6, 32
+; RV64-NEXT:    add a2, a6, a2
+; RV64-NEXT:    slli a3, a7, 32
+; RV64-NEXT:    add a3, a7, a3
+; RV64-NEXT:    addi a6, a0, -16
+; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2382,52 +2413,30 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a1, 32
 ; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    addi a7, sp, 16
 ; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a7, vlenb
 ; RV64-NEXT:    slli a7, a7, 3
 ; RV64-NEXT:    add a7, sp, a7
 ; RV64-NEXT:    addi a7, a7, 16
 ; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
@@ -2442,17 +2451,17 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2475,113 +2484,144 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB35_2
+; RV32-NEXT:    bltu a0, a3, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vnot.v v0, v8
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a0, -16
+; RV32-NEXT:    sltu a0, a0, a3
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 8
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v16, 16
+; RV32-NEXT:    vor.vv v16, v16, v8
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v0, v16, a2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v0, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
 ; RV32-NEXT:    vand.vv v24, v24, v8
 ; RV32-NEXT:    vadd.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vadd.vv v24, v24, v0
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 2
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 8
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 16
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vx v0, v24, a2
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vnot.v v16, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v24
 ; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsub.vv v0, v16, v0
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v8, v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v0, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    li a2, 56
@@ -2607,78 +2647,100 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:  .LBB35_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    li a2, 32
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a7, a3, 1365
+; RV64-NEXT:    addiw a3, a4, 819
+; RV64-NEXT:    addiw a4, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a7, a7, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a5, a3, a5
+; RV64-NEXT:    slli a3, a4, 32
+; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    slli a4, a6, 32
+; RV64-NEXT:    add a4, a6, a4
+; RV64-NEXT:    addi a6, a0, -16
+; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v24, v8, a1
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 8
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 2
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 16
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 8
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 16
 ; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vx v24, v16, a1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a7
+; RV64-NEXT:    vsub.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v16, a2
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a5
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vnot.v v16, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a5
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vmul.vx v16, v16, a4
+; RV64-NEXT:    vsrl.vx v16, v16, a0
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
@@ -2689,6 +2751,7 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -2696,10 +2759,9 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2717,6 +2779,7 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -2724,10 +2787,9 @@ define <2 x i8> @vp_ctlz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2745,6 +2807,7 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -2752,10 +2815,9 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2773,6 +2835,7 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -2780,10 +2843,9 @@ define <4 x i8> @vp_ctlz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2801,6 +2863,7 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -2808,10 +2871,9 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2829,6 +2891,7 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -2836,10 +2899,9 @@ define <8 x i8> @vp_ctlz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2857,6 +2919,7 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -2864,10 +2927,9 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2885,6 +2947,7 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
@@ -2892,10 +2955,9 @@ define <16 x i8> @vp_ctlz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2913,7 +2975,9 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -2922,20 +2986,18 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2950,7 +3012,9 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -2959,20 +3023,18 @@ define <2 x i16> @vp_ctlz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2987,7 +3049,9 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -2996,20 +3060,18 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -3024,7 +3086,9 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -3033,20 +3097,18 @@ define <4 x i16> @vp_ctlz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -3061,7 +3123,9 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -3070,20 +3134,18 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -3098,7 +3160,9 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -3107,20 +3171,18 @@ define <8 x i16> @vp_ctlz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -3135,7 +3197,9 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
@@ -3144,20 +3208,18 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -3172,7 +3234,9 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
@@ -3181,20 +3245,18 @@ define <16 x i16> @vp_ctlz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -3209,7 +3271,9 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -3220,20 +3284,18 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3249,7 +3311,9 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -3260,20 +3324,18 @@ define <2 x i32> @vp_ctlz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3289,7 +3351,9 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
@@ -3300,20 +3364,18 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3329,7 +3391,9 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v9, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
@@ -3340,20 +3404,18 @@ define <4 x i32> @vp_ctlz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3369,7 +3431,9 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
@@ -3380,20 +3444,18 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3409,7 +3471,9 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v10
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v10, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
@@ -3420,20 +3484,18 @@ define <8 x i32> @vp_ctlz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3449,7 +3511,9 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
@@ -3460,20 +3524,18 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3489,7 +3551,9 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vsrl.vi v12, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
@@ -3500,20 +3564,18 @@ define <16 x i32> @vp_ctlz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
 ; CHECK-NEXT:    vor.vv v8, v8, v12
 ; CHECK-NEXT:    vnot.v v8, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3529,6 +3591,12 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -3538,49 +3606,60 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1, v0.t
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
@@ -3590,38 +3669,21 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v9, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctlz.v2i64(<2 x i64> %va, i1 true, <2 x i1> %m, i32 %evl)
@@ -3633,6 +3695,12 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v9
@@ -3642,40 +3710,34 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vsrl.vi v9, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v9, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v9
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v9, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3685,6 +3747,23 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v9
@@ -3694,37 +3773,20 @@ define <2 x i64> @vp_ctlz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v9, v8, a0
+; RV64-NEXT:    vsrl.vx v9, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v9
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3736,59 +3798,76 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV32-LABEL: vp_ctlz_zero_undef_v4i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v12, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
@@ -3798,38 +3877,21 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v10, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctlz.v4i64(<4 x i64> %va, i1 true, <4 x i1> %m, i32 %evl)
@@ -3840,7 +3902,13 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32-LABEL: vp_ctlz_zero_undef_v4i64_unmasked:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v10
@@ -3850,40 +3918,34 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vsrl.vi v10, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v10, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3893,6 +3955,23 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v10
@@ -3902,37 +3981,20 @@ define <4 x i64> @vp_ctlz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v10
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v10, v8, a0
+; RV64-NEXT:    vsrl.vx v10, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v10
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3944,59 +4006,76 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV32-LABEL: vp_ctlz_zero_undef_v8i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 2, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 8, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 16, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vnot.v v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v16, v12, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
@@ -4006,38 +4085,21 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v12, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctlz.v8i64(<8 x i64> %va, i1 true, <8 x i1> %m, i32 %evl)
@@ -4049,6 +4111,12 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 2
 ; RV32-NEXT:    vor.vv v8, v8, v12
@@ -4058,40 +4126,34 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vsrl.vi v12, v8, 16
 ; RV32-NEXT:    vor.vv v8, v8, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v12, v8, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
-; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a1, a1, 1365
+; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
 ; RV32-NEXT:    vsub.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -4101,6 +4163,23 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v12
@@ -4110,37 +4189,20 @@ define <8 x i64> @vp_ctlz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v12
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v12, v8, a0
+; RV64-NEXT:    vsrl.vx v12, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v12
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -4170,11 +4232,21 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4184,58 +4256,52 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -4249,6 +4315,23 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4258,38 +4341,21 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctlz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
@@ -4314,46 +4380,48 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vx v0, v8, a1
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v0, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -4365,6 +4433,23 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -4374,37 +4459,20 @@ define <15 x i64> @vp_ctlz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v16, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -4434,11 +4502,21 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4448,58 +4526,52 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 40
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v16, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -4513,6 +4585,23 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4522,38 +4611,21 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctlz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
@@ -4578,46 +4650,48 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vsrl.vi v16, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    li a1, 32
-; RV32-NEXT:    vsrl.vx v16, v8, a1
-; RV32-NEXT:    vor.vv v8, v8, v16
-; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vx v0, v8, a1
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vand.vv v0, v24, v16
+; RV32-NEXT:    vsrl.vi v24, v24, 2
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -4629,6 +4703,23 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
+; RV64-NEXT:    lui a0, 349525
+; RV64-NEXT:    lui a1, 209715
+; RV64-NEXT:    lui a2, 61681
+; RV64-NEXT:    lui a3, 4112
+; RV64-NEXT:    addiw a0, a0, 1365
+; RV64-NEXT:    addiw a1, a1, 819
+; RV64-NEXT:    addiw a2, a2, -241
+; RV64-NEXT:    addiw a3, a3, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    li a4, 32
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v16
@@ -4638,37 +4729,20 @@ define <16 x i64> @vp_ctlz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 16
 ; RV64-NEXT:    vor.vv v8, v8, v16
-; RV64-NEXT:    li a0, 32
-; RV64-NEXT:    vsrl.vx v16, v8, a0
+; RV64-NEXT:    vsrl.vx v16, v8, a4
 ; RV64-NEXT:    vor.vv v8, v8, v16
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -4694,29 +4768,32 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB70_2
+; RV32-NEXT:    bltu a0, a1, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB70_2:
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    addi a4, sp, 32
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -4726,37 +4803,31 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a5, 40
+; RV32-NEXT:    mul a3, a3, a5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 48
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
@@ -5017,6 +5088,28 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:  .LBB70_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    li a1, 32
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a6, a4, -241
+; RV64-NEXT:    addiw a7, a5, 257
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a5, a2, a5
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a4, a3, a4
+; RV64-NEXT:    slli a2, a6, 32
+; RV64-NEXT:    add a2, a6, a2
+; RV64-NEXT:    slli a3, a7, 32
+; RV64-NEXT:    add a3, a7, a3
+; RV64-NEXT:    addi a6, a0, -16
+; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -5026,52 +5119,30 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    li a1, 32
 ; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    addi a7, sp, 16
 ; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a7, vlenb
 ; RV64-NEXT:    slli a7, a7, 3
 ; RV64-NEXT:    add a7, sp, a7
 ; RV64-NEXT:    addi a7, a7, 16
 ; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
@@ -5086,17 +5157,17 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -5119,113 +5190,144 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB71_2
+; RV32-NEXT:    bltu a0, a3, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB71_2:
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 4
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 8
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vsrl.vi v24, v8, 16
-; RV32-NEXT:    vor.vv v8, v8, v24
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    vsrl.vx v24, v8, a2
-; RV32-NEXT:    vor.vv v8, v8, v24
-; RV32-NEXT:    vnot.v v0, v8
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a3, a0, -16
+; RV32-NEXT:    sltu a0, a0, a3
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a0, a0, a3
 ; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v0, v8, 4
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 8
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 16
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v0, v8, a2
+; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 8
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vnot.v v0, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v16, 16
+; RV32-NEXT:    vor.vv v16, v16, v8
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v0, 1
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
 ; RV32-NEXT:    vsub.vv v24, v0, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vx v0, v16, a2
+; RV32-NEXT:    vor.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v0, v24, v8
 ; RV32-NEXT:    vsrl.vi v24, v24, 2
 ; RV32-NEXT:    vand.vv v24, v24, v8
 ; RV32-NEXT:    vadd.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vadd.vv v24, v24, v0
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, a0, -16
-; RV32-NEXT:    sltu a0, a0, a3
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vor.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 2
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 8
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vi v0, v24, 16
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vsrl.vx v0, v24, a2
-; RV32-NEXT:    vor.vv v24, v24, v0
-; RV32-NEXT:    vnot.v v24, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vnot.v v16, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 1
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v0, v0, v24
 ; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v16, v8
-; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsub.vv v0, v16, v0
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v8, v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v24, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vand.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v0
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v0, v24
+; RV32-NEXT:    vmul.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v24, v8, v24
 ; RV32-NEXT:    li a2, 56
@@ -5251,78 +5353,100 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:  .LBB71_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    li a2, 32
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a7, a3, 1365
+; RV64-NEXT:    addiw a3, a4, 819
+; RV64-NEXT:    addiw a4, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a5, a7, 32
+; RV64-NEXT:    add a7, a7, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a5, a3, a5
+; RV64-NEXT:    slli a3, a4, 32
+; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    slli a4, a6, 32
+; RV64-NEXT:    add a4, a6, a4
+; RV64-NEXT:    addi a6, a0, -16
+; RV64-NEXT:    sltu a0, a0, a6
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a6, a0, a6
+; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 2
 ; RV64-NEXT:    vor.vv v8, v8, v24
 ; RV64-NEXT:    vsrl.vi v24, v8, 4
 ; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 8
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 16
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    li a1, 32
-; RV64-NEXT:    vsrl.vx v24, v8, a1
-; RV64-NEXT:    vor.vv v8, v8, v24
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v24, a2
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 8
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 2
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 16
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v8, a2
+; RV64-NEXT:    vor.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 8
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vnot.v v8, v8
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 16
 ; RV64-NEXT:    vor.vv v16, v16, v24
-; RV64-NEXT:    vsrl.vx v24, v16, a1
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a7
+; RV64-NEXT:    vsub.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v24, v16, a2
 ; RV64-NEXT:    vor.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a5
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vnot.v v16, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vand.vx v24, v24, a7
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a5
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a0
+; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vmul.vx v16, v16, a4
+; RV64-NEXT:    vsrl.vx v16, v16, a0
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
index 228a9f0d6d5221..4bd4a9a854f365 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll
@@ -13,6 +13,7 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -21,10 +22,9 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -39,12 +39,12 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    li a1, 134
 ; RVF-NEXT:    vzext.vf2 v10, v8
 ; RVF-NEXT:    vfwcvt.f.xu.v v12, v10
 ; RVF-NEXT:    vnsrl.wi v8, v12, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVF-NEXT:    vnsrl.wi v10, v8, 0
-; RVF-NEXT:    li a1, 134
 ; RVF-NEXT:    vrsub.vx v8, v10, a1
 ; RVF-NEXT:    li a1, 8
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -55,12 +55,12 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    li a1, 134
 ; RVD-NEXT:    vzext.vf2 v10, v8
 ; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
 ; RVD-NEXT:    vnsrl.wi v8, v12, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVD-NEXT:    vnsrl.wi v10, v8, 0
-; RVD-NEXT:    li a1, 134
 ; RVD-NEXT:    vrsub.vx v8, v10, a1
 ; RVD-NEXT:    li a1, 8
 ; RVD-NEXT:    vminu.vx v8, v8, a1
@@ -87,6 +87,8 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVI-NEXT:    vle16.v v8, (a0)
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -97,20 +99,18 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -122,9 +122,9 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
+; RVF-NEXT:    li a1, 142
 ; RVF-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVF-NEXT:    vnsrl.wi v8, v10, 23
-; RVF-NEXT:    li a1, 142
 ; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 16
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -135,9 +135,9 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
+; RVD-NEXT:    li a1, 142
 ; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    vnsrl.wi v8, v10, 23
-; RVD-NEXT:    li a1, 142
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 16
 ; RVD-NEXT:    vminu.vx v8, v8, a1
@@ -164,6 +164,8 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVI-NEXT:    vle32.v v8, (a0)
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -176,20 +178,18 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -205,8 +205,8 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v8, v8, 23
 ; RVF-NEXT:    li a1, 158
+; RVF-NEXT:    vsrl.vi v8, v8, 23
 ; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -217,8 +217,8 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    li a1, 52
+; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    vnsrl.wx v8, v10, a1
 ; RVD-NEXT:    li a1, 1054
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
@@ -247,50 +247,50 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 2
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 8
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 16
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    li a1, 32
-; RV32I-NEXT:    vsrl.vx v9, v8, a1
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    vmv.v.x v9, a1
+; RV32I-NEXT:    li a1, 32
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vx v10, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -301,6 +301,23 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 32
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 2
@@ -311,37 +328,20 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v9
-; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    vsrl.vx v9, v8, a1
+; RV64I-NEXT:    vsrl.vx v9, v8, a5
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v9, v9, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vx v9, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -401,6 +401,7 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    li a1, 32
 ; RVI-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -409,10 +410,9 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -428,12 +428,12 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    li a1, 134
 ; RVF-NEXT:    vzext.vf2 v12, v8
 ; RVF-NEXT:    vfwcvt.f.xu.v v16, v12
 ; RVF-NEXT:    vnsrl.wi v8, v16, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVF-NEXT:    vnsrl.wi v12, v8, 0
-; RVF-NEXT:    li a1, 134
 ; RVF-NEXT:    vrsub.vx v8, v12, a1
 ; RVF-NEXT:    li a1, 8
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -445,12 +445,12 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    li a1, 134
 ; RVD-NEXT:    vzext.vf2 v12, v8
 ; RVD-NEXT:    vfwcvt.f.xu.v v16, v12
 ; RVD-NEXT:    vnsrl.wi v8, v16, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVD-NEXT:    vnsrl.wi v12, v8, 0
-; RVD-NEXT:    li a1, 134
 ; RVD-NEXT:    vrsub.vx v8, v12, a1
 ; RVD-NEXT:    li a1, 8
 ; RVD-NEXT:    vminu.vx v8, v8, a1
@@ -478,6 +478,8 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVI-NEXT:    vle16.v v8, (a0)
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -488,20 +490,18 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -513,9 +513,9 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
+; RVF-NEXT:    li a1, 142
 ; RVF-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVF-NEXT:    vnsrl.wi v8, v12, 23
-; RVF-NEXT:    li a1, 142
 ; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 16
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -526,9 +526,9 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
+; RVD-NEXT:    li a1, 142
 ; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    vnsrl.wi v8, v12, 23
-; RVD-NEXT:    li a1, 142
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 16
 ; RVD-NEXT:    vminu.vx v8, v8, a1
@@ -555,6 +555,8 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVI-NEXT:    vle32.v v8, (a0)
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -567,20 +569,18 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -596,8 +596,8 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v8, v8, 23
 ; RVF-NEXT:    li a1, 158
+; RVF-NEXT:    vsrl.vi v8, v8, 23
 ; RVF-NEXT:    vrsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vminu.vx v8, v8, a1
@@ -608,8 +608,8 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    li a1, 52
+; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    vnsrl.wx v8, v12, a1
 ; RVD-NEXT:    li a1, 1054
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
@@ -638,50 +638,50 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 2
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 8
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 16
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    li a1, 32
-; RV32I-NEXT:    vsrl.vx v10, v8, a1
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
+; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    li a1, 32
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vx v12, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -692,6 +692,23 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 32
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 2
@@ -702,37 +719,20 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v10
-; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    vsrl.vx v10, v8, a1
+; RV64I-NEXT:    vsrl.vx v10, v8, a5
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v10, v10, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vx v10, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -791,6 +791,7 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -799,10 +800,9 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -860,6 +860,8 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVI-NEXT:    vle16.v v8, (a0)
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -870,20 +872,18 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -932,6 +932,8 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVI-NEXT:    vle32.v v8, (a0)
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 2
@@ -944,20 +946,18 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v9
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -983,8 +983,8 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    li a1, 52
+; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
 ; RVD-NEXT:    vnsrl.wx v8, v10, a1
 ; RVD-NEXT:    li a1, 1054
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
@@ -1010,50 +1010,50 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 2
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 8
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 16
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    li a1, 32
-; RV32I-NEXT:    vsrl.vx v9, v8, a1
-; RV32I-NEXT:    vor.vv v8, v8, v9
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    vmv.v.x v9, a1
+; RV32I-NEXT:    li a1, 32
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vx v10, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vor.vv v8, v8, v10
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1064,6 +1064,23 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 32
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 2
@@ -1074,37 +1091,20 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v9
-; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    vsrl.vx v9, v8, a1
+; RV64I-NEXT:    vsrl.vx v9, v8, a5
 ; RV64I-NEXT:    vor.vv v8, v8, v9
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v9, v9, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vx v9, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1158,6 +1158,7 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    li a1, 32
 ; RVI-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVI-NEXT:    vle8.v v8, (a0)
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -1166,10 +1167,9 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -1230,6 +1230,8 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVI-NEXT:    vle16.v v8, (a0)
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -1240,20 +1242,18 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -1302,6 +1302,8 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI:       # %bb.0:
 ; RVI-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVI-NEXT:    vle32.v v8, (a0)
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 2
@@ -1314,20 +1316,18 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vor.vv v8, v8, v10
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -1353,8 +1353,8 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    li a1, 52
+; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
 ; RVD-NEXT:    vnsrl.wx v8, v12, a1
 ; RVD-NEXT:    li a1, 1054
 ; RVD-NEXT:    vrsub.vx v8, v8, a1
@@ -1380,50 +1380,50 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 2
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 8
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 16
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    li a1, 32
-; RV32I-NEXT:    vsrl.vx v10, v8, a1
-; RV32I-NEXT:    vor.vv v8, v8, v10
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
+; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    li a1, 32
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 2
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 4
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 8
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 16
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vx v12, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vor.vv v8, v8, v12
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1434,6 +1434,23 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 32
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 2
@@ -1444,37 +1461,20 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 16
 ; RV64I-NEXT:    vor.vv v8, v8, v10
-; RV64I-NEXT:    li a1, 32
-; RV64I-NEXT:    vsrl.vx v10, v8, a1
+; RV64I-NEXT:    vsrl.vx v10, v8, a5
 ; RV64I-NEXT:    vor.vv v8, v8, v10
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v10, v10, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vx v10, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index a8ae3389fb2a56..5e73e6df9170c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -13,8 +13,8 @@ define <2 x i8> @vp_ctpop_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -34,8 +34,8 @@ define <2 x i8> @vp_ctpop_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -57,8 +57,8 @@ define <4 x i8> @vp_ctpop_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -78,8 +78,8 @@ define <4 x i8> @vp_ctpop_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -101,8 +101,8 @@ define <8 x i8> @vp_ctpop_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -122,8 +122,8 @@ define <8 x i8> @vp_ctpop_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -145,8 +145,8 @@ define <16 x i8> @vp_ctpop_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl)
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -166,8 +166,8 @@ define <16 x i8> @vp_ctpop_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -190,17 +190,17 @@ define <2 x i16> @vp_ctpop_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -218,17 +218,17 @@ define <2 x i16> @vp_ctpop_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -248,17 +248,17 @@ define <4 x i16> @vp_ctpop_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -276,17 +276,17 @@ define <4 x i16> @vp_ctpop_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -306,17 +306,17 @@ define <8 x i16> @vp_ctpop_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -334,17 +334,17 @@ define <8 x i16> @vp_ctpop_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -364,17 +364,17 @@ define <16 x i16> @vp_ctpop_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -392,17 +392,17 @@ define <16 x i16> @vp_ctpop_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -422,17 +422,17 @@ define <2 x i32> @vp_ctpop_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -451,17 +451,17 @@ define <2 x i32> @vp_ctpop_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -482,17 +482,17 @@ define <4 x i32> @vp_ctpop_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -511,17 +511,17 @@ define <4 x i32> @vp_ctpop_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -542,17 +542,17 @@ define <8 x i32> @vp_ctpop_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -571,17 +571,17 @@ define <8 x i32> @vp_ctpop_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -602,17 +602,17 @@ define <16 x i32> @vp_ctpop_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %ev
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -631,17 +631,17 @@ define <16 x i32> @vp_ctpop_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -663,67 +663,67 @@ define <2 x i64> @vp_ctpop_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v2i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v9, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.ctpop.v2i64(<2 x i64> %va, <2 x i1> %m, i32 %evl)
@@ -739,31 +739,31 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -771,34 +771,34 @@ define <2 x i64> @vp_ctpop_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v2i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v9, a0
+; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -817,67 +817,67 @@ define <4 x i64> @vp_ctpop_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v4i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v10, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.ctpop.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl)
@@ -893,31 +893,31 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -925,34 +925,34 @@ define <4 x i64> @vp_ctpop_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v4i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v10, a0
+; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -971,67 +971,67 @@ define <8 x i64> @vp_ctpop_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v16, v12, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV32-NEXT:    lui a1, 61681
-; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_ctpop_v8i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v12, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.ctpop.v8i64(<8 x i64> %va, <8 x i1> %m, i32 %evl)
@@ -1047,31 +1047,31 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1079,34 +1079,34 @@ define <8 x i64> @vp_ctpop_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v8i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v12, a0
+; RV64-NEXT:    vand.vx v12, v12, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1122,10 +1122,11 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -1144,66 +1145,41 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -1212,35 +1188,35 @@ define <15 x i64> @vp_ctpop_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_ctpop_v15i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.ctpop.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
@@ -1265,28 +1241,31 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v16, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v0, v8
 ; RV32-NEXT:    vsrl.vi v0, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vand.vv v8, v8, v16
@@ -1299,34 +1278,34 @@ define <15 x i64> @vp_ctpop_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v15i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1342,10 +1321,11 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -1364,66 +1344,41 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
 ; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
-; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -1432,35 +1387,35 @@ define <16 x i64> @vp_ctpop_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_ctpop_v16i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.ctpop.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl)
@@ -1485,28 +1440,31 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a1), zero
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v16, v0
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
+; RV32-NEXT:    vsub.vv v8, v8, v0
+; RV32-NEXT:    vand.vv v0, v8, v24
 ; RV32-NEXT:    vsrl.vi v8, v8, 2
 ; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    addi a1, sp, 8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
-; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v8, v0, v8
 ; RV32-NEXT:    vsrl.vi v0, v8, 4
 ; RV32-NEXT:    vadd.vv v8, v8, v0
 ; RV32-NEXT:    vand.vv v8, v8, v16
@@ -1519,34 +1477,34 @@ define <16 x i64> @vp_ctpop_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ;
 ; RV64-LABEL: vp_ctpop_v16i64_unmasked:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a0
+; RV64-NEXT:    vand.vx v16, v16, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1567,111 +1525,110 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
-; RV32-NEXT:    mul a1, a1, a2
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB34_2
+; RV32-NEXT:    bltu a0, a3, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB34_2:
 ; RV32-NEXT:    addi a2, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a2), zero
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, sp, 24
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv8r.v v8, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v16, 4, v0.t
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
@@ -1690,36 +1647,36 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 4
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 40
+; RV32-NEXT:    li a2, 24
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
@@ -1764,58 +1721,58 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
+; RV64-NEXT:    addiw a1, a1, 1365
 ; RV64-NEXT:    addiw a2, a2, 819
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a5, a1, a5
+; RV64-NEXT:    slli a1, a2, 32
+; RV64-NEXT:    add a6, a2, a1
+; RV64-NEXT:    slli a1, a3, 32
+; RV64-NEXT:    add a1, a3, a1
+; RV64-NEXT:    slli a2, a4, 32
+; RV64-NEXT:    add a2, a4, a2
+; RV64-NEXT:    addi a3, a0, -16
+; RV64-NEXT:    sltu a0, a0, a3
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a3
+; RV64-NEXT:    li a3, 56
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a6, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    addiw a3, a3, -241
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addiw a4, a4, 257
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    li a5, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
-; RV64-NEXT:    addi a6, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a6, a0, -16
-; RV64-NEXT:    sltu a0, a0, a6
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a6
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a3, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a6, vlenb
-; RV64-NEXT:    slli a6, a6, 3
-; RV64-NEXT:    add a6, sp, a6
-; RV64-NEXT:    addi a6, a6, 16
-; RV64-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v8, v16, a6, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -1835,88 +1792,141 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
+; RV32-NEXT:    vmv8r.v v24, v16
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB35_2
+; RV32-NEXT:    bltu a0, a3, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
 ; RV32-NEXT:    addi a2, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 32
-; RV32-NEXT:    vlse64.v v24, (a2), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v0, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v0, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v0, v8
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a2
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv8r.v v8, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v8, v16
-; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v24, v24, 1
+; RV32-NEXT:    vand.vv v16, v24, v16
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v24, v16
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v8, v24
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v16, v0
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v8, v0
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v8, v0
+; RV32-NEXT:    addi a2, sp, 24
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vadd.vv v16, v8, v16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v8, (a2), zero
 ; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vadd.vv v24, v24, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v0, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 4
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vand.vv v24, v24, v8
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v24
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v16, v0
+; RV32-NEXT:    vmul.vv v16, v24, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vmul.vv v24, v8, v0
 ; RV32-NEXT:    li a2, 56
@@ -1925,7 +1935,8 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v24, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -1942,51 +1953,61 @@ define <32 x i64> @vp_ctpop_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:  .LBB35_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a2, 209715
-; RV64-NEXT:    addiw a2, a2, 819
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v24, v8, a2
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a2
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 61681
-; RV64-NEXT:    addiw a3, a3, -241
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    lui a4, 4112
-; RV64-NEXT:    addiw a4, a4, 257
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vmul.vx v8, v8, a4
-; RV64-NEXT:    li a5, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a5
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a4, a4, -241
+; RV64-NEXT:    addiw a5, a5, 257
+; RV64-NEXT:    slli a6, a2, 32
+; RV64-NEXT:    add a2, a2, a6
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a3, a3, a6
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    slli a6, a5, 32
+; RV64-NEXT:    add a5, a5, a6
 ; RV64-NEXT:    addi a6, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a6
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a6
+; RV64-NEXT:    li a6, 56
+; RV64-NEXT:    vand.vx v24, v24, a2
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v24, v24, a2
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a2
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a3
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a3
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a2
+; RV64-NEXT:    vand.vx v16, v16, a3
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a3
-; RV64-NEXT:    vmul.vx v16, v16, a4
-; RV64-NEXT:    vsrl.vx v16, v16, a5
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a6
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v16, v16, a5
+; RV64-NEXT:    vsrl.vx v16, v16, a6
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
index b5114bbe491896..4fbe67cfcd642a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll
@@ -9,11 +9,11 @@ define void @ctpop_v16i8(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    li a1, 85
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a1, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
@@ -44,21 +44,21 @@ define void @ctpop_v8i16(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a1, 5
 ; CHECK-NEXT:    addi a1, a1, 1365
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a1, 3
 ; CHECK-NEXT:    addi a1, a1, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a1
@@ -86,21 +86,21 @@ define void @ctpop_v4i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    lui a1, 349525
 ; CHECK-NEXT:    addi a1, a1, 1365
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a1, 209715
 ; CHECK-NEXT:    addi a1, a1, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    lui a1, 61681
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    lui a1, 4112
 ; CHECK-NEXT:    addi a1, a1, 257
@@ -133,32 +133,32 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vi v10, v8, 1
 ; RV32-NEXT:    vand.vv v9, v10, v9
-; RV32-NEXT:    vsub.vv v8, v8, v9
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
@@ -169,33 +169,33 @@ define void @ctpop_v2i64(ptr %x, ptr %y) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vsrl.vi v9, v8, 1
 ; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
 ; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    vsrl.vi v9, v8, 1
 ; RV64-NEXT:    vand.vx v9, v9, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v9, v8, a1
+; RV64-NEXT:    vand.vx v9, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    addiw a1, a1, -241
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    lui a1, 4112
-; RV64-NEXT:    addiw a1, a1, 257
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vmul.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a1, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a1
 ; RV64-NEXT:    vse64.v v8, (a0)
@@ -222,11 +222,11 @@ define void @ctpop_v32i8(ptr %x, ptr %y) {
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    li a1, 85
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    li a1, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
@@ -258,21 +258,21 @@ define void @ctpop_v16i16(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a1, 5
 ; CHECK-NEXT:    addi a1, a1, 1365
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a1, 3
 ; CHECK-NEXT:    addi a1, a1, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    lui a1, 1
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    li a1, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a1
@@ -300,21 +300,21 @@ define void @ctpop_v8i32(ptr %x, ptr %y) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    lui a1, 349525
 ; CHECK-NEXT:    addi a1, a1, 1365
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a1
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a1, 209715
 ; CHECK-NEXT:    addi a1, a1, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a1
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    lui a1, 61681
+; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a1, 61681
-; CHECK-NEXT:    addi a1, a1, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    lui a1, 4112
 ; CHECK-NEXT:    addi a1, a1, 257
@@ -439,32 +439,32 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vi v12, v8, 1
 ; RV32-NEXT:    vand.vv v10, v12, v10
-; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
@@ -475,33 +475,33 @@ define void @ctpop_v4i64(ptr %x, ptr %y) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    vsrl.vi v10, v8, 1
 ; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 209715
+; RV64-NEXT:    lui a3, 61681
+; RV64-NEXT:    lui a4, 4112
 ; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    addiw a2, a2, 819
+; RV64-NEXT:    addiw a3, a3, -241
+; RV64-NEXT:    addiw a4, a4, 257
+; RV64-NEXT:    slli a5, a1, 32
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    vsrl.vi v10, v8, 1
 ; RV64-NEXT:    vand.vx v10, v10, a1
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a1, 209715
-; RV64-NEXT:    addiw a1, a1, 819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v10, v8, a1
+; RV64-NEXT:    vand.vx v10, v8, a2
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a2
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a1, 61681
-; RV64-NEXT:    addiw a1, a1, -241
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vand.vx v8, v8, a1
-; RV64-NEXT:    lui a1, 4112
-; RV64-NEXT:    addiw a1, a1, 257
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    vmul.vx v8, v8, a1
+; RV64-NEXT:    vand.vx v8, v8, a3
+; RV64-NEXT:    vmul.vx v8, v8, a4
 ; RV64-NEXT:    li a1, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a1
 ; RV64-NEXT:    vse64.v v8, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index d4c4ea7fee184f..cd4b19f11d1602 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -12,13 +12,13 @@ define <2 x i8> @vp_cttz_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -36,14 +36,14 @@ define <2 x i8> @vp_cttz_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -64,13 +64,13 @@ define <4 x i8> @vp_cttz_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -88,14 +88,14 @@ define <4 x i8> @vp_cttz_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -116,13 +116,13 @@ define <8 x i8> @vp_cttz_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -140,14 +140,14 @@ define <8 x i8> @vp_cttz_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -168,13 +168,13 @@ define <16 x i8> @vp_cttz_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -192,14 +192,14 @@ define <16 x i8> @vp_cttz_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -220,23 +220,23 @@ define <2 x i16> @vp_cttz_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -251,24 +251,24 @@ define <2 x i16> @vp_cttz_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -286,23 +286,23 @@ define <4 x i16> @vp_cttz_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -317,24 +317,24 @@ define <4 x i16> @vp_cttz_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -352,23 +352,23 @@ define <8 x i16> @vp_cttz_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -383,24 +383,24 @@ define <8 x i16> @vp_cttz_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -418,23 +418,23 @@ define <16 x i16> @vp_cttz_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -449,24 +449,24 @@ define <16 x i16> @vp_cttz_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -484,23 +484,23 @@ define <2 x i32> @vp_cttz_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -516,24 +516,24 @@ define <2 x i32> @vp_cttz_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -552,23 +552,23 @@ define <4 x i32> @vp_cttz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -584,24 +584,24 @@ define <4 x i32> @vp_cttz_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -620,23 +620,23 @@ define <8 x i32> @vp_cttz_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -652,24 +652,24 @@ define <8 x i32> @vp_cttz_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -688,23 +688,23 @@ define <16 x i32> @vp_cttz_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -720,24 +720,24 @@ define <16 x i32> @vp_cttz_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -756,78 +756,78 @@ define <2 x i64> @vp_cttz_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 false, <2 x i1> %m, i32 %evl)
@@ -839,39 +839,39 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vnot.v v9, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -880,37 +880,37 @@ define <2 x i64> @vp_cttz_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -926,78 +926,78 @@ define <4 x i64> @vp_cttz_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 false, <4 x i1> %m, i32 %evl)
@@ -1009,39 +1009,39 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vnot.v v10, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1050,37 +1050,37 @@ define <4 x i64> @vp_cttz_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1096,78 +1096,78 @@ define <8 x i64> @vp_cttz_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 false, <8 x i1> %m, i32 %evl)
@@ -1179,39 +1179,39 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vnot.v v12, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1220,37 +1220,37 @@ define <8 x i64> @vp_cttz_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1269,6 +1269,9 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -1285,59 +1288,60 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1350,38 +1354,38 @@ define <15 x i64> @vp_cttz_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl
 ; RV64-LABEL: vp_cttz_v15i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 false, <15 x i1> %m, i32 %evl)
@@ -1393,6 +1397,9 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -32
 ; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 24(sp)
@@ -1406,36 +1413,35 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v0, v8, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1446,37 +1452,37 @@ define <15 x i64> @vp_cttz_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v15i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1495,6 +1501,9 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -1511,59 +1520,60 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -1576,38 +1586,38 @@ define <16 x i64> @vp_cttz_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl
 ; RV64-LABEL: vp_cttz_v16i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 false, <16 x i1> %m, i32 %evl)
@@ -1619,6 +1629,9 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -32
 ; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 24(sp)
@@ -1632,36 +1645,35 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v0, v8, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -1672,37 +1684,37 @@ define <16 x i64> @vp_cttz_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
 ; RV64-LABEL: vp_cttz_v16i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -1718,50 +1730,51 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB34_2
+; RV32-NEXT:    bltu a0, a1, .LBB34_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB34_2:
 ; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
@@ -1773,72 +1786,40 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a3, sp, 32
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
@@ -1855,16 +1836,13 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a3, a3, a5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -1874,23 +1852,15 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
@@ -1903,7 +1873,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
@@ -1913,84 +1883,49 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -2002,7 +1937,7 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
@@ -2026,73 +1961,73 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
-; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    mv a4, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    li a4, 16
 ; RV64-NEXT:  .LBB34_2:
 ; RV64-NEXT:    li a1, 1
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a7, a5, -241
+; RV64-NEXT:    addiw t0, a6, 257
+; RV64-NEXT:    slli a6, a2, 32
+; RV64-NEXT:    add a6, a2, a6
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a5, a3, a5
+; RV64-NEXT:    slli a2, a7, 32
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
+; RV64-NEXT:    addi a7, a0, -16
+; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a7, a0, a7
+; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    slli a7, a7, 3
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -2112,105 +2047,102 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB35_2
+; RV32-NEXT:    bltu a0, a3, .LBB35_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB35_2:
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v24, v8, a2
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vnot.v v0, v8
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vand.vv v0, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v8
-; RV32-NEXT:    vadd.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vadd.vv v24, v24, v0
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v8, v8, a2
+; RV32-NEXT:    vand.vv v8, v0, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v0, v16, a2
+; RV32-NEXT:    vnot.v v16, v16
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vsub.vv v0, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v24, v0, a2
-; RV32-NEXT:    vnot.v v0, v0
-; RV32-NEXT:    vand.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v16, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    vadd.vv v24, v24, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
 ; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vadd.vv v8, v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v0, v24
+; RV32-NEXT:    vmul.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
+; RV32-NEXT:    vmul.vv v24, v8, v0
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v16, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v24, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -2227,58 +2159,68 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
 ; RV64-NEXT:  .LBB35_2:
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v8, a2
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v24
+; RV64-NEXT:    vnot.v v24, v8
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a7, a3, 32
+; RV64-NEXT:    add a3, a3, a7
+; RV64-NEXT:    slli a7, a4, 32
+; RV64-NEXT:    add a4, a4, a7
+; RV64-NEXT:    slli a7, a5, 32
+; RV64-NEXT:    add a5, a5, a7
+; RV64-NEXT:    slli a7, a6, 32
+; RV64-NEXT:    add a6, a6, a7
+; RV64-NEXT:    addi a7, a0, -16
+; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    li a7, 56
+; RV64-NEXT:    vsub.vx v8, v8, a2
+; RV64-NEXT:    vand.vv v8, v24, v8
 ; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a1, a1, a3
-; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v24, v16, a2
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a4
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a6
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a7
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vmul.vx v16, v16, a6
+; RV64-NEXT:    vsrl.vx v16, v16, a7
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 false, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v
@@ -2290,13 +2232,13 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2314,14 +2256,14 @@ define <2 x i8> @vp_cttz_zero_undef_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2340,13 +2282,13 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2364,14 +2306,14 @@ define <4 x i8> @vp_cttz_zero_undef_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2390,13 +2332,13 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2414,14 +2356,14 @@ define <8 x i8> @vp_cttz_zero_undef_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2440,13 +2382,13 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zero
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    li a0, 85
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
@@ -2464,14 +2406,14 @@ define <16 x i8> @vp_cttz_zero_undef_v16i8_unmasked(<16 x i8> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    li a0, 85
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    li a0, 51
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
@@ -2490,23 +2432,23 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2521,24 +2463,24 @@ define <2 x i16> @vp_cttz_zero_undef_v2i16_unmasked(<2 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2554,23 +2496,23 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2585,24 +2527,24 @@ define <4 x i16> @vp_cttz_zero_undef_v4i16_unmasked(<4 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2618,23 +2560,23 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2649,24 +2591,24 @@ define <8 x i16> @vp_cttz_zero_undef_v8i16_unmasked(<8 x i16> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2682,23 +2624,23 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16(<16 x i16> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 5
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0, v0.t
@@ -2713,24 +2655,24 @@ define <16 x i16> @vp_cttz_zero_undef_v16i16_unmasked(<16 x i16> %va, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 3
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 1
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    li a0, 257
 ; CHECK-NEXT:    vmul.vx v8, v8, a0
@@ -2746,23 +2688,23 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2778,24 +2720,24 @@ define <2 x i32> @vp_cttz_zero_undef_v2i32_unmasked(<2 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2812,23 +2754,23 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vsub.vx v9, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vx v9, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2844,24 +2786,24 @@ define <4 x i32> @vp_cttz_zero_undef_v4i32_unmasked(<4 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vsub.vx v9, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v9
-; CHECK-NEXT:    vsrl.vi v9, v8, 1
+; CHECK-NEXT:    vnot.v v9, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v9, v8
+; CHECK-NEXT:    vsrl.vi v9, v8, 1
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v9
 ; CHECK-NEXT:    vand.vx v9, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsrl.vi v9, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2878,23 +2820,23 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vsub.vx v10, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2910,24 +2852,24 @@ define <8 x i32> @vp_cttz_zero_undef_v8i32_unmasked(<8 x i32> %va, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vsub.vx v10, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 1
+; CHECK-NEXT:    vnot.v v10, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v10, v8
+; CHECK-NEXT:    vsrl.vi v10, v8, 1
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v10
 ; CHECK-NEXT:    vand.vx v10, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v10
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2944,23 +2886,23 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32(<16 x i32> %va, <16 x i1> %m, i32 z
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vsub.vx v12, v8, a1, v0.t
+; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
+; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
 ; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
-; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; CHECK-NEXT:    vadd.vv v8, v8, v12, v0.t
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -2976,24 +2918,24 @@ define <16 x i32> @vp_cttz_zero_undef_v16i32_unmasked(<16 x i32> %va, i32 zeroex
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vsub.vx v12, v8, a1
-; CHECK-NEXT:    vnot.v v8, v8
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsrl.vi v12, v8, 1
+; CHECK-NEXT:    vnot.v v12, v8
+; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    lui a0, 349525
 ; CHECK-NEXT:    addi a0, a0, 1365
+; CHECK-NEXT:    vand.vv v8, v12, v8
+; CHECK-NEXT:    vsrl.vi v12, v8, 1
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    lui a0, 209715
 ; CHECK-NEXT:    addi a0, a0, 819
+; CHECK-NEXT:    vsub.vv v8, v8, v12
 ; CHECK-NEXT:    vand.vx v12, v8, a0
 ; CHECK-NEXT:    vsrl.vi v8, v8, 2
 ; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 61681
+; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vadd.vv v8, v12, v8
 ; CHECK-NEXT:    vsrl.vi v12, v8, 4
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
 ; CHECK-NEXT:    vand.vx v8, v8, a0
 ; CHECK-NEXT:    lui a0, 4112
 ; CHECK-NEXT:    addi a0, a0, 257
@@ -3010,78 +2952,78 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV32-NEXT:    vsub.vx v9, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
-; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
-; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v9, v9, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v9, v0.t
+; RV32-NEXT:    vand.vv v9, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8, v0.t
+; RV32-NEXT:    vsrl.vi v9, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v9, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v9, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v9, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v9, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v9, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <2 x i64> @llvm.vp.cttz.v2i64(<2 x i64> %va, i1 true, <2 x i1> %m, i32 %evl)
@@ -3093,39 +3035,39 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vsub.vx v9, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vnot.v v9, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v9, v9, v10
-; RV32-NEXT:    vsub.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v10, v8, v9
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v9
-; RV32-NEXT:    vadd.vv v8, v10, v8
-; RV32-NEXT:    vsrl.vi v9, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 1
+; RV32-NEXT:    vand.vv v9, v9, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v9
+; RV32-NEXT:    vsub.vv v8, v8, v9
+; RV32-NEXT:    vand.vv v9, v8, v10
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vadd.vv v8, v9, v8
+; RV32-NEXT:    vsrl.vi v9, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v9, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v10
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3134,37 +3076,37 @@ define <2 x i64> @vp_cttz_zero_undef_v2i64_unmasked(<2 x i64> %va, i32 zeroext %
 ; RV64-LABEL: vp_cttz_zero_undef_v2i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
 ; RV64-NEXT:    vsub.vx v9, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v9
 ; RV64-NEXT:    vsrl.vi v9, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v9, v8, a0
+; RV64-NEXT:    vand.vx v9, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vsrl.vi v9, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3178,78 +3120,78 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV32-NEXT:    vsub.vx v10, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    addi a1, a1, 819
 ; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
-; RV32-NEXT:    vadd.vv v8, v12, v8, v0.t
-; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 1, v0.t
+; RV32-NEXT:    vand.vv v10, v10, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v10, v0.t
+; RV32-NEXT:    vand.vv v10, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8, v0.t
+; RV32-NEXT:    vsrl.vi v10, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    vmul.vv v8, v8, v10, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v4i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v10, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v10, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v10, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v10, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <4 x i64> @llvm.vp.cttz.v4i64(<4 x i64> %va, i1 true, <4 x i1> %m, i32 %evl)
@@ -3261,39 +3203,39 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vsub.vx v10, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vnot.v v10, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v10, v10, v12
-; RV32-NEXT:    vsub.vv v8, v8, v10
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v12, v8, v10
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v10
-; RV32-NEXT:    vadd.vv v8, v12, v8
-; RV32-NEXT:    vsrl.vi v10, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 1
+; RV32-NEXT:    vand.vv v10, v10, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v10
+; RV32-NEXT:    vsub.vv v8, v8, v10
+; RV32-NEXT:    vand.vv v10, v8, v12
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vadd.vv v8, v10, v8
+; RV32-NEXT:    vsrl.vi v10, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v12
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3302,37 +3244,37 @@ define <4 x i64> @vp_cttz_zero_undef_v4i64_unmasked(<4 x i64> %va, i32 zeroext %
 ; RV64-LABEL: vp_cttz_zero_undef_v4i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; RV64-NEXT:    vsub.vx v10, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v10
 ; RV64-NEXT:    vsrl.vi v10, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v10, v8, a0
+; RV64-NEXT:    vand.vx v10, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vsrl.vi v10, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3346,78 +3288,78 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroe
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV32-NEXT:    vsub.vx v12, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16, v0.t
-; RV32-NEXT:    vsub.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 209715
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    addi a1, a1, 819
+; RV32-NEXT:    vand.vv v12, v8, v12, v0.t
+; RV32-NEXT:    vsrl.vi v8, v12, 1, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v12, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vsub.vv v12, v12, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v12, v8, v0.t
+; RV32-NEXT:    vsrl.vi v12, v12, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v12, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v16, a1
+; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v12, v0.t
-; RV32-NEXT:    li a0, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v8i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v12, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v12, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v12, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v12, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <8 x i64> @llvm.vp.cttz.v8i64(<8 x i64> %va, i1 true, <8 x i1> %m, i32 %evl)
@@ -3429,39 +3371,39 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    li a1, 1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vsub.vx v12, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vnot.v v12, v8
+; RV32-NEXT:    vsub.vx v8, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v16, a1
-; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v12, v12, v16
-; RV32-NEXT:    vsub.vv v8, v8, v12
 ; RV32-NEXT:    lui a1, 209715
 ; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v16, v8, v12
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v12
-; RV32-NEXT:    vadd.vv v8, v16, v8
-; RV32-NEXT:    vsrl.vi v12, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 1
+; RV32-NEXT:    vand.vv v12, v12, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 61681
 ; RV32-NEXT:    addi a1, a1, -241
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v12
+; RV32-NEXT:    vsub.vv v8, v8, v12
+; RV32-NEXT:    vand.vv v12, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v8, 2
+; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vmv.v.x v16, a1
 ; RV32-NEXT:    lui a1, 4112
 ; RV32-NEXT:    addi a1, a1, 257
+; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vadd.vv v8, v12, v8
+; RV32-NEXT:    vsrl.vi v12, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v12, a1
 ; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
+; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3470,37 +3412,37 @@ define <8 x i64> @vp_cttz_zero_undef_v8i64_unmasked(<8 x i64> %va, i32 zeroext %
 ; RV64-LABEL: vp_cttz_zero_undef_v8i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; RV64-NEXT:    vsub.vx v12, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v12
 ; RV64-NEXT:    vsrl.vi v12, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v12, v8, a0
+; RV64-NEXT:    vand.vx v12, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v12, v8
 ; RV64-NEXT:    vsrl.vi v12, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3517,6 +3459,9 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -3533,59 +3478,60 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -3598,38 +3544,38 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64(<15 x i64> %va, <15 x i1> %m, i32 z
 ; RV64-LABEL: vp_cttz_zero_undef_v15i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <15 x i64> @llvm.vp.cttz.v15i64(<15 x i64> %va, i1 true, <15 x i1> %m, i32 %evl)
@@ -3641,6 +3587,9 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -32
 ; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 24(sp)
@@ -3654,36 +3603,35 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v0, v8, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3694,37 +3642,37 @@ define <15 x i64> @vp_cttz_zero_undef_v15i64_unmasked(<15 x i64> %va, i32 zeroex
 ; RV64-LABEL: vp_cttz_zero_undef_v15i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3741,6 +3689,9 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
@@ -3757,59 +3708,60 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a1), zero
+; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vlse64.v v16, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v24, 1, v0.t
+; RV32-NEXT:    vnot.v v8, v8, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a1), zero
+; RV32-NEXT:    addi a1, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    addi a1, sp, 24
+; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v8, v24, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
 ; RV32-NEXT:    li a0, 56
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 48
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
@@ -3822,38 +3774,38 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64(<16 x i64> %va, <16 x i1> %m, i32 z
 ; RV64-LABEL: vp_cttz_zero_undef_v16i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0, v0.t
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <16 x i64> @llvm.vp.cttz.v16i64(<16 x i64> %va, i1 true, <16 x i1> %m, i32 %evl)
@@ -3865,6 +3817,9 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -32
 ; RV32-NEXT:    .cfi_def_cfa_offset 32
+; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v16, v8, a1
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 24(sp)
@@ -3878,36 +3833,35 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    lui a1, 4112
+; RV32-NEXT:    vnot.v v8, v8
 ; RV32-NEXT:    addi a1, a1, 257
 ; RV32-NEXT:    sw a1, 0(sp)
 ; RV32-NEXT:    sw a1, 4(sp)
-; RV32-NEXT:    li a1, 1
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a1
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v0, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
-; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
-; RV32-NEXT:    vsub.vv v8, v8, v16
-; RV32-NEXT:    vand.vv v16, v8, v24
-; RV32-NEXT:    vsrl.vi v8, v8, 2
-; RV32-NEXT:    vand.vv v8, v8, v24
-; RV32-NEXT:    vadd.vv v8, v16, v8
 ; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v16, v8, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v0, v8, v0
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a1), zero
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v0, v16, v24
+; RV32-NEXT:    vsrl.vi v16, v16, 2
+; RV32-NEXT:    vand.vv v16, v16, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vadd.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v0, v16, 4
+; RV32-NEXT:    vadd.vv v16, v16, v0
+; RV32-NEXT:    vand.vv v8, v16, v8
 ; RV32-NEXT:    vmul.vv v8, v8, v24
 ; RV32-NEXT:    li a0, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a0
@@ -3918,37 +3872,37 @@ define <16 x i64> @vp_cttz_zero_undef_v16i64_unmasked(<16 x i64> %va, i32 zeroex
 ; RV64-LABEL: vp_cttz_zero_undef_v16i64_unmasked:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a4, 61681
+; RV64-NEXT:    lui a5, 4112
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1
+; RV64-NEXT:    addiw a0, a2, 1365
+; RV64-NEXT:    addiw a1, a3, 819
+; RV64-NEXT:    addiw a2, a4, -241
+; RV64-NEXT:    addiw a3, a5, 257
+; RV64-NEXT:    slli a4, a0, 32
+; RV64-NEXT:    add a0, a0, a4
+; RV64-NEXT:    slli a4, a1, 32
+; RV64-NEXT:    add a1, a1, a4
+; RV64-NEXT:    slli a4, a2, 32
+; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    vnot.v v8, v8
 ; RV64-NEXT:    vand.vv v8, v8, v16
 ; RV64-NEXT:    vsrl.vi v16, v8, 1
-; RV64-NEXT:    lui a0, 349525
-; RV64-NEXT:    addiw a0, a0, 1365
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vand.vx v16, v16, a0
 ; RV64-NEXT:    vsub.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 209715
-; RV64-NEXT:    addiw a0, a0, 819
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v8, a0
+; RV64-NEXT:    vand.vx v16, v8, a1
 ; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vv v8, v16, v8
 ; RV64-NEXT:    vsrl.vi v16, v8, 4
 ; RV64-NEXT:    vadd.vv v8, v8, v16
-; RV64-NEXT:    lui a0, 61681
-; RV64-NEXT:    addiw a0, a0, -241
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vand.vx v8, v8, a0
-; RV64-NEXT:    lui a0, 4112
-; RV64-NEXT:    addiw a0, a0, 257
-; RV64-NEXT:    slli a1, a0, 32
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    vand.vx v8, v8, a2
+; RV64-NEXT:    vmul.vx v8, v8, a3
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsrl.vx v8, v8, a0
 ; RV64-NEXT:    ret
@@ -3962,50 +3916,51 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a3, 16
-; RV32-NEXT:    sw a1, 16(sp)
-; RV32-NEXT:    sw a1, 20(sp)
+; RV32-NEXT:    li a1, 16
+; RV32-NEXT:    addi a2, a2, 257
+; RV32-NEXT:    sw a2, 16(sp)
+; RV32-NEXT:    sw a2, 20(sp)
 ; RV32-NEXT:    mv a2, a0
-; RV32-NEXT:    bltu a0, a3, .LBB70_2
+; RV32-NEXT:    bltu a0, a1, .LBB70_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a2, 16
 ; RV32-NEXT:  .LBB70_2:
 ; RV32-NEXT:    li a1, 1
+; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, sp, 40
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    li a5, 24
+; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
@@ -4017,72 +3972,40 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a3, sp, 32
 ; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
@@ -4099,16 +4022,13 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a3, a3, a5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a3), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
+; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
@@ -4118,23 +4038,15 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
@@ -4147,7 +4059,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
@@ -4157,84 +4069,49 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsub.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v8, v24, 2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -4246,7 +4123,7 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
@@ -4270,73 +4147,73 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v0, 2
-; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    mv a4, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB70_2
 ; RV64-NEXT:  # %bb.1:
-; RV64-NEXT:    li a2, 16
+; RV64-NEXT:    li a4, 16
 ; RV64-NEXT:  .LBB70_2:
 ; RV64-NEXT:    li a1, 1
-; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV64-NEXT:    lui a2, 349525
+; RV64-NEXT:    lui a3, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a2, a2, 1365
+; RV64-NEXT:    addiw a3, a3, 819
+; RV64-NEXT:    addiw a7, a5, -241
+; RV64-NEXT:    addiw t0, a6, 257
+; RV64-NEXT:    slli a6, a2, 32
+; RV64-NEXT:    add a6, a2, a6
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a5, a3, a5
+; RV64-NEXT:    slli a2, a7, 32
+; RV64-NEXT:    add a2, a7, a2
+; RV64-NEXT:    slli a3, t0, 32
+; RV64-NEXT:    add a3, t0, a3
+; RV64-NEXT:    addi a7, a0, -16
+; RV64-NEXT:    sltu a0, a0, a7
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a7, a0, a7
+; RV64-NEXT:    li a0, 56
+; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    lui a2, 349525
-; RV64-NEXT:    addiw a2, a2, 1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a7, a0, -16
-; RV64-NEXT:    sltu a0, a0, a7
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    slli a7, a7, 3
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    csrr a4, vlenb
+; RV64-NEXT:    slli a4, a4, 3
+; RV64-NEXT:    add a4, sp, a4
+; RV64-NEXT:    addi a4, a4, 16
+; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
 ; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
 ; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a0, vlenb
@@ -4356,105 +4233,102 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 16 * vlenb
-; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 8 * vlenb
 ; RV32-NEXT:    lui a1, 349525
+; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
 ; RV32-NEXT:    sw a1, 40(sp)
 ; RV32-NEXT:    sw a1, 44(sp)
-; RV32-NEXT:    lui a1, 209715
-; RV32-NEXT:    addi a1, a1, 819
-; RV32-NEXT:    sw a1, 32(sp)
-; RV32-NEXT:    sw a1, 36(sp)
 ; RV32-NEXT:    lui a1, 61681
+; RV32-NEXT:    addi a2, a2, 819
+; RV32-NEXT:    sw a2, 32(sp)
+; RV32-NEXT:    sw a2, 36(sp)
+; RV32-NEXT:    lui a2, 4112
 ; RV32-NEXT:    addi a1, a1, -241
 ; RV32-NEXT:    sw a1, 24(sp)
 ; RV32-NEXT:    sw a1, 28(sp)
-; RV32-NEXT:    lui a1, 4112
-; RV32-NEXT:    addi a1, a1, 257
-; RV32-NEXT:    li a2, 16
+; RV32-NEXT:    li a3, 16
+; RV32-NEXT:    addi a1, a2, 257
 ; RV32-NEXT:    sw a1, 16(sp)
 ; RV32-NEXT:    sw a1, 20(sp)
 ; RV32-NEXT:    mv a1, a0
-; RV32-NEXT:    bltu a0, a2, .LBB71_2
+; RV32-NEXT:    bltu a0, a3, .LBB71_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    li a1, 16
 ; RV32-NEXT:  .LBB71_2:
 ; RV32-NEXT:    li a2, 1
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v24, v8, a2
-; RV32-NEXT:    vnot.v v8, v8
-; RV32-NEXT:    vand.vv v0, v8, v24
+; RV32-NEXT:    vnot.v v0, v8
 ; RV32-NEXT:    addi a3, sp, 40
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v0, 1
-; RV32-NEXT:    vand.vv v24, v24, v16
-; RV32-NEXT:    vsub.vv v24, v0, v24
-; RV32-NEXT:    vand.vv v0, v24, v8
-; RV32-NEXT:    vsrl.vi v24, v24, 2
-; RV32-NEXT:    vand.vv v24, v24, v8
-; RV32-NEXT:    vadd.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 4
-; RV32-NEXT:    vadd.vv v24, v24, v0
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a3), zero
 ; RV32-NEXT:    addi a3, a0, -16
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    addi a3, sp, 32
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v8, v8, a2
+; RV32-NEXT:    vand.vv v8, v0, v8
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vx v0, v16, a2
+; RV32-NEXT:    vnot.v v16, v16
+; RV32-NEXT:    vand.vv v16, v16, v0
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v0, v8, 1
+; RV32-NEXT:    vand.vv v0, v0, v24
+; RV32-NEXT:    vsub.vv v0, v8, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v24, v0, a2
-; RV32-NEXT:    vnot.v v0, v0
-; RV32-NEXT:    vand.vv v24, v0, v24
-; RV32-NEXT:    vsrl.vi v0, v24, 1
-; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vsrl.vi v8, v16, 1
+; RV32-NEXT:    vand.vv v24, v8, v24
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    addi a2, sp, 24
-; RV32-NEXT:    vsub.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v24, v16, v8
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v16, v16, v24
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v24, v0, v8
+; RV32-NEXT:    vsrl.vi v0, v0, 2
+; RV32-NEXT:    vand.vv v0, v0, v8
+; RV32-NEXT:    vadd.vv v24, v24, v0
+; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT:    vand.vv v0, v16, v8
 ; RV32-NEXT:    vsrl.vi v16, v16, 2
 ; RV32-NEXT:    vand.vv v8, v16, v8
+; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v24, 4
+; RV32-NEXT:    vadd.vv v16, v24, v16
+; RV32-NEXT:    addi a4, sp, 48
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vlse64.v v24, (a2), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v24, v8
+; RV32-NEXT:    vadd.vv v8, v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a2), zero
+; RV32-NEXT:    vlse64.v v0, (a3), zero
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v0, v8, 4
-; RV32-NEXT:    vadd.vv v8, v8, v0
+; RV32-NEXT:    vsrl.vi v16, v8, 4
+; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v0, v0, v16
+; RV32-NEXT:    vand.vv v16, v16, v24
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v16, v0, v24
+; RV32-NEXT:    vmul.vv v16, v16, v0
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v24, v8, v24
+; RV32-NEXT:    vmul.vv v24, v8, v0
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v16, a2
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v24, a2
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    addi sp, sp, 48
@@ -4471,58 +4345,68 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex
 ; RV64-NEXT:  .LBB71_2:
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v24, v8, a2
-; RV64-NEXT:    vnot.v v8, v8
-; RV64-NEXT:    vand.vv v8, v8, v24
-; RV64-NEXT:    vsrl.vi v24, v8, 1
-; RV64-NEXT:    lui a1, 349525
-; RV64-NEXT:    addiw a1, a1, 1365
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a1, a1, a3
-; RV64-NEXT:    vand.vx v24, v24, a1
-; RV64-NEXT:    vsub.vv v8, v8, v24
-; RV64-NEXT:    lui a3, 209715
-; RV64-NEXT:    addiw a3, a3, 819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    vand.vx v24, v8, a3
-; RV64-NEXT:    vsrl.vi v8, v8, 2
-; RV64-NEXT:    vand.vx v8, v8, a3
-; RV64-NEXT:    vadd.vv v8, v24, v8
-; RV64-NEXT:    vsrl.vi v24, v8, 4
-; RV64-NEXT:    vadd.vv v8, v8, v24
-; RV64-NEXT:    lui a4, 61681
-; RV64-NEXT:    addiw a4, a4, -241
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    vand.vx v8, v8, a4
-; RV64-NEXT:    lui a5, 4112
-; RV64-NEXT:    addiw a5, a5, 257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    vmul.vx v8, v8, a5
-; RV64-NEXT:    li a6, 56
-; RV64-NEXT:    vsrl.vx v8, v8, a6
+; RV64-NEXT:    vnot.v v24, v8
+; RV64-NEXT:    lui a3, 349525
+; RV64-NEXT:    lui a4, 209715
+; RV64-NEXT:    lui a5, 61681
+; RV64-NEXT:    lui a6, 4112
+; RV64-NEXT:    addiw a3, a3, 1365
+; RV64-NEXT:    addiw a4, a4, 819
+; RV64-NEXT:    addiw a5, a5, -241
+; RV64-NEXT:    addiw a6, a6, 257
+; RV64-NEXT:    slli a7, a3, 32
+; RV64-NEXT:    add a3, a3, a7
+; RV64-NEXT:    slli a7, a4, 32
+; RV64-NEXT:    add a4, a4, a7
+; RV64-NEXT:    slli a7, a5, 32
+; RV64-NEXT:    add a5, a5, a7
+; RV64-NEXT:    slli a7, a6, 32
+; RV64-NEXT:    add a6, a6, a7
 ; RV64-NEXT:    addi a7, a0, -16
 ; RV64-NEXT:    sltu a0, a0, a7
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a7
+; RV64-NEXT:    li a7, 56
+; RV64-NEXT:    vsub.vx v8, v8, a2
+; RV64-NEXT:    vand.vv v8, v24, v8
+; RV64-NEXT:    vsrl.vi v24, v8, 1
+; RV64-NEXT:    vand.vx v24, v24, a3
+; RV64-NEXT:    vsub.vv v8, v8, v24
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsub.vx v24, v16, a2
 ; RV64-NEXT:    vnot.v v16, v16
 ; RV64-NEXT:    vand.vv v16, v16, v24
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v8, a4
+; RV64-NEXT:    vsrl.vi v8, v8, 2
+; RV64-NEXT:    vand.vx v8, v8, a4
+; RV64-NEXT:    vadd.vv v8, v24, v8
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 1
-; RV64-NEXT:    vand.vx v24, v24, a1
+; RV64-NEXT:    vand.vx v24, v24, a3
 ; RV64-NEXT:    vsub.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v24, v16, a3
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vi v24, v8, 4
+; RV64-NEXT:    vadd.vv v8, v8, v24
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v24, v16, a4
 ; RV64-NEXT:    vsrl.vi v16, v16, 2
-; RV64-NEXT:    vand.vx v16, v16, a3
+; RV64-NEXT:    vand.vx v16, v16, a4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vand.vx v8, v8, a5
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v24, v16
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vmul.vx v8, v8, a6
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsrl.vi v24, v16, 4
+; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT:    vsrl.vx v8, v8, a7
+; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vadd.vv v16, v16, v24
-; RV64-NEXT:    vand.vx v16, v16, a4
-; RV64-NEXT:    vmul.vx v16, v16, a5
-; RV64-NEXT:    vsrl.vx v16, v16, a6
+; RV64-NEXT:    vand.vx v16, v16, a5
+; RV64-NEXT:    vmul.vx v16, v16, a6
+; RV64-NEXT:    vsrl.vx v16, v16, a7
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 true, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x i64> %v

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
index 4b1691aada5bef..57e0eeb92ee2f6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll
@@ -15,13 +15,13 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle8.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -36,6 +36,7 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v9, v8, v9
 ; RVF-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -44,7 +45,6 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vnsrl.wi v10, v12, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVF-NEXT:    vnsrl.wi v9, v10, 0
-; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
 ; RVF-NEXT:    vsub.vx v8, v9, a1
 ; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -55,6 +55,7 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v9, v8, v9
 ; RVD-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -63,7 +64,6 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vnsrl.wi v10, v12, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; RVD-NEXT:    vnsrl.wi v9, v10, 0
-; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
 ; RVD-NEXT:    vsub.vx v8, v9, a1
 ; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -92,23 +92,23 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle16.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -120,15 +120,15 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
-; RVF-NEXT:    vrsub.vi v9, v8, 0
-; RVF-NEXT:    vand.vv v9, v8, v9
-; RVF-NEXT:    vfwcvt.f.xu.v v10, v9
-; RVF-NEXT:    vnsrl.wi v9, v10, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v9, v9, a1
+; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vand.vv v8, v8, v9
+; RVF-NEXT:    vfwcvt.f.xu.v v10, v8
+; RVF-NEXT:    vnsrl.wi v8, v10, 23
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 16
-; RVF-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse16.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -136,15 +136,15 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
-; RVD-NEXT:    vrsub.vi v9, v8, 0
-; RVD-NEXT:    vand.vv v9, v8, v9
-; RVD-NEXT:    vfwcvt.f.xu.v v10, v9
-; RVD-NEXT:    vnsrl.wi v9, v10, 23
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vsub.vx v9, v9, a1
+; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vand.vv v8, v8, v9
+; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
+; RVD-NEXT:    vnsrl.wi v8, v10, 23
+; RVD-NEXT:    vsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 16
-; RVD-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse16.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -170,23 +170,23 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle32.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -199,17 +199,17 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVF-NEXT:    vle32.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v9, v8, v9
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v9, v9
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v9, v9, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v9, v9, a1
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vsrl.vi v8, v9, 23
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 32
-; RVF-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse32.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -217,16 +217,16 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v9, v8, v9
 ; RVD-NEXT:    vfwcvt.f.xu.v v10, v9
-; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vnsrl.wx v9, v10, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v9, v9, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vsub.vx v8, v9, a1
 ; RVD-NEXT:    li a1, 32
-; RVD-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse32.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -250,40 +250,40 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v9, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    vmv.v.x v9, a1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vx v10, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -294,37 +294,37 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v9, v8, a1
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 1
+; RV64I-NEXT:    vsub.vx v9, v8, a5
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v9, v9, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vx v9, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -334,19 +334,21 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVF-NEXT:    vle64.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v9, v8, v9
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RVF-NEXT:    vfncvt.f.xu.w v10, v9
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v9, v10, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vwsubu.vx v10, v9, a1
 ; RVF-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; RVF-NEXT:    vsrl.vi v8, v10, 23
+; RVF-NEXT:    vwsubu.vx v9, v8, a1
 ; RVF-NEXT:    li a1, 64
-; RVF-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVF-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RVF-NEXT:    vmerge.vxm v8, v9, a1, v0
 ; RVF-NEXT:    vse64.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -354,18 +356,18 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVD-NEXT:    vle64.v v8, (a0)
+; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v9, v8, v9
-; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vfcvt.f.xu.v v9, v9
 ; RVD-NEXT:    fsrm a1
 ; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vsrl.vx v9, v9, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v9, v9, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vsub.vx v8, v9, a1
 ; RVD-NEXT:    li a1, 64
-; RVD-NEXT:    vmerge.vxm v8, v9, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse64.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -392,13 +394,13 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle8.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -414,6 +416,7 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    li a1, 32
 ; RVF-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVF-NEXT:    vle8.v v8, (a0)
+; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v10, v8, v10
 ; RVF-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -422,7 +425,6 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVF-NEXT:    vnsrl.wi v12, v16, 23
 ; RVF-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVF-NEXT:    vnsrl.wi v10, v12, 0
-; RVF-NEXT:    li a1, 127
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
 ; RVF-NEXT:    vsub.vx v8, v10, a1
 ; RVF-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -434,6 +436,7 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    li a1, 32
 ; RVD-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; RVD-NEXT:    vle8.v v8, (a0)
+; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v10, v8, v10
 ; RVD-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -442,7 +445,6 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind {
 ; RVD-NEXT:    vnsrl.wi v12, v16, 23
 ; RVD-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; RVD-NEXT:    vnsrl.wi v10, v12, 0
-; RVD-NEXT:    li a1, 127
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
 ; RVD-NEXT:    vsub.vx v8, v10, a1
 ; RVD-NEXT:    vmerge.vim v8, v8, 8, v0
@@ -472,23 +474,23 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle16.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -500,15 +502,15 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVF-NEXT:    vle16.v v8, (a0)
-; RVF-NEXT:    vrsub.vi v10, v8, 0
-; RVF-NEXT:    vand.vv v10, v8, v10
-; RVF-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVF-NEXT:    vnsrl.wi v10, v12, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v10, v10, a1
+; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vand.vv v8, v8, v10
+; RVF-NEXT:    vfwcvt.f.xu.v v12, v8
+; RVF-NEXT:    vnsrl.wi v8, v12, 23
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 16
-; RVF-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse16.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -516,15 +518,15 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RVD-NEXT:    vle16.v v8, (a0)
-; RVD-NEXT:    vrsub.vi v10, v8, 0
-; RVD-NEXT:    vand.vv v10, v8, v10
-; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVD-NEXT:    vnsrl.wi v10, v12, 23
 ; RVD-NEXT:    li a1, 127
-; RVD-NEXT:    vsub.vx v10, v10, a1
+; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vand.vv v8, v8, v10
+; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
+; RVD-NEXT:    vnsrl.wi v8, v12, 23
+; RVD-NEXT:    vsub.vx v8, v8, a1
 ; RVD-NEXT:    li a1, 16
-; RVD-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse16.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -550,23 +552,23 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle32.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -579,17 +581,17 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVF-NEXT:    vle32.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v10, v8, v10
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v10, v10
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v10, v10, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vsub.vx v10, v10, a1
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vsrl.vi v8, v10, 23
+; RVF-NEXT:    vsub.vx v8, v8, a1
 ; RVF-NEXT:    li a1, 32
-; RVF-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVF-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVF-NEXT:    vse32.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -597,16 +599,16 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v10, v8, v10
 ; RVD-NEXT:    vfwcvt.f.xu.v v12, v10
-; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vnsrl.wx v10, v12, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v10, v10, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vsub.vx v8, v10, a1
 ; RVD-NEXT:    li a1, 32
-; RVD-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse32.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -630,40 +632,40 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v10, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
+; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vx v12, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -674,37 +676,37 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v10, v8, a1
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 1
+; RV64I-NEXT:    vsub.vx v10, v8, a5
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v10, v10, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vx v10, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -714,19 +716,21 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RVF-NEXT:    vle64.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v10, v8, v10
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RVF-NEXT:    vfncvt.f.xu.w v12, v10
 ; RVF-NEXT:    fsrm a1
-; RVF-NEXT:    vsrl.vi v10, v12, 23
 ; RVF-NEXT:    li a1, 127
-; RVF-NEXT:    vwsubu.vx v12, v10, a1
 ; RVF-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; RVF-NEXT:    vmseq.vi v0, v8, 0
+; RVF-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RVF-NEXT:    vsrl.vi v8, v12, 23
+; RVF-NEXT:    vwsubu.vx v10, v8, a1
 ; RVF-NEXT:    li a1, 64
-; RVF-NEXT:    vmerge.vxm v8, v12, a1, v0
+; RVF-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RVF-NEXT:    vmerge.vxm v8, v10, a1, v0
 ; RVF-NEXT:    vse64.v v8, (a0)
 ; RVF-NEXT:    ret
 ;
@@ -734,18 +738,18 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RVD-NEXT:    vle64.v v8, (a0)
+; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v10, v8, v10
-; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vfcvt.f.xu.v v10, v10
 ; RVD-NEXT:    fsrm a1
 ; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vsrl.vx v10, v10, a1
 ; RVD-NEXT:    li a1, 1023
-; RVD-NEXT:    vsub.vx v10, v10, a1
 ; RVD-NEXT:    vmseq.vi v0, v8, 0
+; RVD-NEXT:    vsub.vx v8, v10, a1
 ; RVD-NEXT:    li a1, 64
-; RVD-NEXT:    vmerge.vxm v8, v10, a1, v0
+; RVD-NEXT:    vmerge.vxm v8, v8, a1, v0
 ; RVD-NEXT:    vse64.v v8, (a0)
 ; RVD-NEXT:    ret
 ;
@@ -771,13 +775,13 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle8.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -843,23 +847,23 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle16.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -914,23 +918,23 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle32.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v9, v8, a1
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v9
 ; RVI-NEXT:    vsrl.vi v9, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v9, v9, a1
-; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v9
 ; RVI-NEXT:    vand.vx v9, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v9, v8
 ; RVI-NEXT:    vsrl.vi v9, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v9
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -943,9 +947,9 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVF-NEXT:    vle32.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v9
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVF-NEXT:    fsrm a1
 ; RVF-NEXT:    vsrl.vi v8, v8, 23
@@ -958,10 +962,10 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v9
 ; RVD-NEXT:    vfwcvt.f.xu.v v10, v8
-; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vnsrl.wx v8, v10, a1
 ; RVD-NEXT:    li a1, 1023
 ; RVD-NEXT:    vsub.vx v8, v8, a1
@@ -987,40 +991,40 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v9, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vsrl.vi v9, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    vmv.v.x v9, a1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v9, v9, v10
-; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vx v10, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsrl.vi v10, v8, 1
+; RV32I-NEXT:    vand.vv v9, v10, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
-; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v10, v8, v9
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v9
-; RV32I-NEXT:    vadd.vv v8, v10, v8
-; RV32I-NEXT:    vsrl.vi v9, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v9
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v9
+; RV32I-NEXT:    vsub.vv v8, v8, v9
+; RV32I-NEXT:    vand.vv v9, v8, v10
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v9, v8
+; RV32I-NEXT:    vsrl.vi v9, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v9
 ; RV32I-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32I-NEXT:    vmv.v.x v9, a1
 ; RV32I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v10
 ; RV32I-NEXT:    vmul.vv v8, v8, v9
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1031,37 +1035,37 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v9, v8, a1
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 1
+; RV64I-NEXT:    vsub.vx v9, v8, a5
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v9
 ; RV64I-NEXT:    vsrl.vi v9, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v9, v9, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v9, v8, a1
+; RV64I-NEXT:    vand.vx v9, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v9, v8
 ; RV64I-NEXT:    vsrl.vi v9, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v9
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1071,9 +1075,9 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVF-NEXT:    vle64.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v9, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v9
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RVF-NEXT:    vfncvt.f.xu.w v9, v8
 ; RVF-NEXT:    fsrm a1
@@ -1087,9 +1091,9 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVD-NEXT:    vle64.v v8, (a0)
+; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vrsub.vi v9, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v9
-; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVD-NEXT:    fsrm a1
 ; RVD-NEXT:    li a1, 52
@@ -1121,13 +1125,13 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle8.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    li a1, 85
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    li a1, 51
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
@@ -1196,23 +1200,23 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle16.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    lui a1, 5
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 5
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 3
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 1
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 1
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    li a1, 257
 ; RVI-NEXT:    vmul.vx v8, v8, a1
@@ -1267,23 +1271,23 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVI-NEXT:    vle32.v v8, (a0)
 ; RVI-NEXT:    li a1, 1
 ; RVI-NEXT:    vsub.vx v10, v8, a1
+; RVI-NEXT:    lui a1, 349525
+; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vnot.v v8, v8
 ; RVI-NEXT:    vand.vv v8, v8, v10
 ; RVI-NEXT:    vsrl.vi v10, v8, 1
-; RVI-NEXT:    lui a1, 349525
-; RVI-NEXT:    addi a1, a1, 1365
 ; RVI-NEXT:    vand.vx v10, v10, a1
-; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    lui a1, 209715
 ; RVI-NEXT:    addi a1, a1, 819
+; RVI-NEXT:    vsub.vv v8, v8, v10
 ; RVI-NEXT:    vand.vx v10, v8, a1
 ; RVI-NEXT:    vsrl.vi v8, v8, 2
 ; RVI-NEXT:    vand.vx v8, v8, a1
+; RVI-NEXT:    lui a1, 61681
+; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vadd.vv v8, v10, v8
 ; RVI-NEXT:    vsrl.vi v10, v8, 4
 ; RVI-NEXT:    vadd.vv v8, v8, v10
-; RVI-NEXT:    lui a1, 61681
-; RVI-NEXT:    addi a1, a1, -241
 ; RVI-NEXT:    vand.vx v8, v8, a1
 ; RVI-NEXT:    lui a1, 4112
 ; RVI-NEXT:    addi a1, a1, 257
@@ -1296,9 +1300,9 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVF-NEXT:    vle32.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v10
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVF-NEXT:    fsrm a1
 ; RVF-NEXT:    vsrl.vi v8, v8, 23
@@ -1311,10 +1315,10 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RVD-NEXT:    vle32.v v8, (a0)
+; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v10
 ; RVD-NEXT:    vfwcvt.f.xu.v v12, v8
-; RVD-NEXT:    li a1, 52
 ; RVD-NEXT:    vnsrl.wx v8, v12, a1
 ; RVD-NEXT:    li a1, 1023
 ; RVD-NEXT:    vsub.vx v8, v8, a1
@@ -1340,40 +1344,40 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32I-NEXT:    vle64.v v8, (a0)
-; RV32I-NEXT:    li a1, 1
-; RV32I-NEXT:    vsub.vx v10, v8, a1
-; RV32I-NEXT:    vnot.v v8, v8
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vsrl.vi v10, v8, 1
 ; RV32I-NEXT:    lui a1, 349525
 ; RV32I-NEXT:    addi a1, a1, 1365
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v12, a1
+; RV32I-NEXT:    vmv.v.x v10, a1
+; RV32I-NEXT:    li a1, 1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v10, v10, v12
-; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vx v12, v8, a1
 ; RV32I-NEXT:    lui a1, 209715
 ; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    vnot.v v8, v8
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsrl.vi v12, v8, 1
+; RV32I-NEXT:    vand.vv v10, v12, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
-; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v12, v8, v10
-; RV32I-NEXT:    vsrl.vi v8, v8, 2
-; RV32I-NEXT:    vand.vv v8, v8, v10
-; RV32I-NEXT:    vadd.vv v8, v12, v8
-; RV32I-NEXT:    vsrl.vi v10, v8, 4
-; RV32I-NEXT:    vadd.vv v8, v8, v10
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 61681
 ; RV32I-NEXT:    addi a1, a1, -241
-; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32I-NEXT:    vand.vv v8, v8, v10
+; RV32I-NEXT:    vsub.vv v8, v8, v10
+; RV32I-NEXT:    vand.vv v10, v8, v12
+; RV32I-NEXT:    vsrl.vi v8, v8, 2
+; RV32I-NEXT:    vand.vv v8, v8, v12
+; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32I-NEXT:    vmv.v.x v12, a1
 ; RV32I-NEXT:    lui a1, 4112
 ; RV32I-NEXT:    addi a1, a1, 257
+; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vadd.vv v8, v10, v8
+; RV32I-NEXT:    vsrl.vi v10, v8, 4
+; RV32I-NEXT:    vadd.vv v8, v8, v10
 ; RV32I-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32I-NEXT:    vmv.v.x v10, a1
 ; RV32I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32I-NEXT:    vand.vv v8, v8, v12
 ; RV32I-NEXT:    vmul.vv v8, v8, v10
 ; RV32I-NEXT:    li a1, 56
 ; RV32I-NEXT:    vsrl.vx v8, v8, a1
@@ -1384,37 +1388,37 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV64I-NEXT:    vle64.v v8, (a0)
-; RV64I-NEXT:    li a1, 1
-; RV64I-NEXT:    vsub.vx v10, v8, a1
+; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
+; RV64I-NEXT:    addiw a1, a1, 1365
+; RV64I-NEXT:    addiw a2, a2, 819
+; RV64I-NEXT:    addiw a3, a3, -241
+; RV64I-NEXT:    addiw a4, a4, 257
+; RV64I-NEXT:    slli a5, a1, 32
+; RV64I-NEXT:    add a1, a1, a5
+; RV64I-NEXT:    slli a5, a2, 32
+; RV64I-NEXT:    add a2, a2, a5
+; RV64I-NEXT:    slli a5, a3, 32
+; RV64I-NEXT:    add a3, a3, a5
+; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    add a4, a4, a5
+; RV64I-NEXT:    li a5, 1
+; RV64I-NEXT:    vsub.vx v10, v8, a5
 ; RV64I-NEXT:    vnot.v v8, v8
 ; RV64I-NEXT:    vand.vv v8, v8, v10
 ; RV64I-NEXT:    vsrl.vi v10, v8, 1
-; RV64I-NEXT:    lui a1, 349525
-; RV64I-NEXT:    addiw a1, a1, 1365
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    vand.vx v10, v10, a1
 ; RV64I-NEXT:    vsub.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v10, v8, a1
+; RV64I-NEXT:    vand.vx v10, v8, a2
 ; RV64I-NEXT:    vsrl.vi v8, v8, 2
-; RV64I-NEXT:    vand.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a2
 ; RV64I-NEXT:    vadd.vv v8, v10, v8
 ; RV64I-NEXT:    vsrl.vi v10, v8, 4
 ; RV64I-NEXT:    vadd.vv v8, v8, v10
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw a1, a1, -241
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vand.vx v8, v8, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw a1, a1, 257
-; RV64I-NEXT:    slli a2, a1, 32
-; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    vmul.vx v8, v8, a1
+; RV64I-NEXT:    vand.vx v8, v8, a3
+; RV64I-NEXT:    vmul.vx v8, v8, a4
 ; RV64I-NEXT:    li a1, 56
 ; RV64I-NEXT:    vsrl.vx v8, v8, a1
 ; RV64I-NEXT:    vse64.v v8, (a0)
@@ -1424,9 +1428,9 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RVF:       # %bb.0:
 ; RVF-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RVF-NEXT:    vle64.v v8, (a0)
+; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vrsub.vi v10, v8, 0
 ; RVF-NEXT:    vand.vv v8, v8, v10
-; RVF-NEXT:    fsrmi a1, 1
 ; RVF-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RVF-NEXT:    vfncvt.f.xu.w v10, v8
 ; RVF-NEXT:    fsrm a1
@@ -1440,9 +1444,9 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind {
 ; RVD:       # %bb.0:
 ; RVD-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RVD-NEXT:    vle64.v v8, (a0)
+; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vrsub.vi v10, v8, 0
 ; RVD-NEXT:    vand.vv v8, v8, v10
-; RVD-NEXT:    fsrmi a1, 1
 ; RVD-NEXT:    vfcvt.f.xu.v v8, v8
 ; RVD-NEXT:    fsrm a1
 ; RVD-NEXT:    li a1, 52

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 060a5c4224fe15..ae5dbfa4bf30ba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -10,30 +10,33 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_v16i1_v32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vlm.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v8, 2
+; CHECK-NEXT:    li a0, -256
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v10, v9, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v9, 1, v0
-; CHECK-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vadd.vv v11, v9, v9
-; CHECK-NEXT:    li a0, -256
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; CHECK-NEXT:    vadd.vi v9, v11, -16
-; CHECK-NEXT:    vrgather.vv v8, v10, v9, v0.t
-; CHECK-NEXT:    vmsne.vi v9, v8, 0
-; CHECK-NEXT:    vnsrl.wi v8, v12, 8
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vadd.vi v12, v11, -16
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v8, 2
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; CHECK-NEXT:    vadd.vi v11, v11, -15
-; CHECK-NEXT:    vrgather.vv v8, v10, v11, v0.t
-; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    vmv.v.v v0, v9
+; CHECK-NEXT:    vmerge.vim v13, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
+; CHECK-NEXT:    vnsrl.wi v8, v14, 0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vrgather.vv v8, v13, v12, v0.t
+; CHECK-NEXT:    vnsrl.wi v12, v14, 8
+; CHECK-NEXT:    vmsne.vi v10, v8, 0
+; CHECK-NEXT:    vrgather.vv v12, v13, v11, v0.t
+; CHECK-NEXT:    vmsne.vi v8, v12, 0
+; CHECK-NEXT:    vmv.v.v v0, v10
 ; CHECK-NEXT:    ret
   %vec = load <32 x i1>, ptr %p
   %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
index 04ebc7ca6b2b89..e13f4f4b50b0ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll
@@ -32,20 +32,20 @@ define void @add_v2i64(ptr %x, ptr %y) {
 ; RV32-NEXT:    lw a5, 4(a0)
 ; RV32-NEXT:    lw a6, 8(a0)
 ; RV32-NEXT:    lw a7, 12(a0)
-; RV32-NEXT:    lw t0, 8(a1)
-; RV32-NEXT:    lw a1, 12(a1)
+; RV32-NEXT:    lw t0, 12(a1)
+; RV32-NEXT:    lw a1, 8(a1)
 ; RV32-NEXT:    add a3, a5, a3
 ; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    add a7, a7, t0
+; RV32-NEXT:    add a1, a6, a1
 ; RV32-NEXT:    sltu a4, a2, a4
+; RV32-NEXT:    sltu a5, a1, a6
 ; RV32-NEXT:    add a3, a3, a4
-; RV32-NEXT:    add a1, a7, a1
-; RV32-NEXT:    add t0, a6, t0
-; RV32-NEXT:    sltu a4, t0, a6
-; RV32-NEXT:    add a1, a1, a4
+; RV32-NEXT:    add a5, a7, a5
 ; RV32-NEXT:    sw a2, 0(a0)
 ; RV32-NEXT:    sw a3, 4(a0)
-; RV32-NEXT:    sw t0, 8(a0)
-; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    sw a1, 8(a0)
+; RV32-NEXT:    sw a5, 12(a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: add_v2i64:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
index 493481ad129d29..e53876d69b59b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll
@@ -325,20 +325,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV32-NEXT:    addi s0, sp, 384
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    andi a1, a1, 255
-; RV32-NEXT:    li a2, 128
-; RV32-NEXT:    addi a3, a0, 128
-; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vle8.v v16, (a3)
-; RV32-NEXT:    vle8.v v24, (a0)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    mv a2, sp
+; RV32-NEXT:    li a3, 128
+; RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV32-NEXT:    vle8.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 128
+; RV32-NEXT:    vle8.v v16, (a0)
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    vmseq.vi v0, v8, 0
+; RV32-NEXT:    vmv.v.i v24, 0
 ; RV32-NEXT:    vmseq.vi v8, v16, 0
-; RV32-NEXT:    vmseq.vi v0, v24, 0
-; RV32-NEXT:    vmv.v.i v16, 0
-; RV32-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV32-NEXT:    vse8.v v24, (a0)
+; RV32-NEXT:    vmerge.vim v16, v24, 1, v0
+; RV32-NEXT:    vse8.v v16, (a2)
 ; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV32-NEXT:    vmerge.vim v8, v24, 1, v0
 ; RV32-NEXT:    addi a0, sp, 128
 ; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    lbu a0, 0(a1)
@@ -356,20 +356,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV64-NEXT:    addi s0, sp, 384
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    andi a1, a1, 255
-; RV64-NEXT:    li a2, 128
-; RV64-NEXT:    addi a3, a0, 128
-; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64-NEXT:    vle8.v v16, (a3)
-; RV64-NEXT:    vle8.v v24, (a0)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    li a3, 128
+; RV64-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV64-NEXT:    vle8.v v8, (a0)
+; RV64-NEXT:    addi a0, a0, 128
+; RV64-NEXT:    vle8.v v16, (a0)
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    vmseq.vi v0, v8, 0
+; RV64-NEXT:    vmv.v.i v24, 0
 ; RV64-NEXT:    vmseq.vi v8, v16, 0
-; RV64-NEXT:    vmseq.vi v0, v24, 0
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV64-NEXT:    vse8.v v24, (a0)
+; RV64-NEXT:    vmerge.vim v16, v24, 1, v0
+; RV64-NEXT:    vse8.v v16, (a2)
 ; RV64-NEXT:    vmv1r.v v0, v8
-; RV64-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV64-NEXT:    vmerge.vim v8, v24, 1, v0
 ; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    lbu a0, 0(a1)
@@ -387,20 +387,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV32ZBS-NEXT:    addi s0, sp, 384
 ; RV32ZBS-NEXT:    andi sp, sp, -128
 ; RV32ZBS-NEXT:    andi a1, a1, 255
-; RV32ZBS-NEXT:    li a2, 128
-; RV32ZBS-NEXT:    addi a3, a0, 128
-; RV32ZBS-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32ZBS-NEXT:    vle8.v v16, (a3)
-; RV32ZBS-NEXT:    vle8.v v24, (a0)
-; RV32ZBS-NEXT:    mv a0, sp
-; RV32ZBS-NEXT:    add a1, a0, a1
+; RV32ZBS-NEXT:    mv a2, sp
+; RV32ZBS-NEXT:    li a3, 128
+; RV32ZBS-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV32ZBS-NEXT:    vle8.v v8, (a0)
+; RV32ZBS-NEXT:    addi a0, a0, 128
+; RV32ZBS-NEXT:    vle8.v v16, (a0)
+; RV32ZBS-NEXT:    add a1, a2, a1
+; RV32ZBS-NEXT:    vmseq.vi v0, v8, 0
+; RV32ZBS-NEXT:    vmv.v.i v24, 0
 ; RV32ZBS-NEXT:    vmseq.vi v8, v16, 0
-; RV32ZBS-NEXT:    vmseq.vi v0, v24, 0
-; RV32ZBS-NEXT:    vmv.v.i v16, 0
-; RV32ZBS-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV32ZBS-NEXT:    vse8.v v24, (a0)
+; RV32ZBS-NEXT:    vmerge.vim v16, v24, 1, v0
+; RV32ZBS-NEXT:    vse8.v v16, (a2)
 ; RV32ZBS-NEXT:    vmv1r.v v0, v8
-; RV32ZBS-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV32ZBS-NEXT:    vmerge.vim v8, v24, 1, v0
 ; RV32ZBS-NEXT:    addi a0, sp, 128
 ; RV32ZBS-NEXT:    vse8.v v8, (a0)
 ; RV32ZBS-NEXT:    lbu a0, 0(a1)
@@ -418,20 +418,20 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind {
 ; RV64ZBS-NEXT:    addi s0, sp, 384
 ; RV64ZBS-NEXT:    andi sp, sp, -128
 ; RV64ZBS-NEXT:    andi a1, a1, 255
-; RV64ZBS-NEXT:    li a2, 128
-; RV64ZBS-NEXT:    addi a3, a0, 128
-; RV64ZBS-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV64ZBS-NEXT:    vle8.v v16, (a3)
-; RV64ZBS-NEXT:    vle8.v v24, (a0)
-; RV64ZBS-NEXT:    mv a0, sp
-; RV64ZBS-NEXT:    add a1, a0, a1
+; RV64ZBS-NEXT:    mv a2, sp
+; RV64ZBS-NEXT:    li a3, 128
+; RV64ZBS-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
+; RV64ZBS-NEXT:    vle8.v v8, (a0)
+; RV64ZBS-NEXT:    addi a0, a0, 128
+; RV64ZBS-NEXT:    vle8.v v16, (a0)
+; RV64ZBS-NEXT:    add a1, a2, a1
+; RV64ZBS-NEXT:    vmseq.vi v0, v8, 0
+; RV64ZBS-NEXT:    vmv.v.i v24, 0
 ; RV64ZBS-NEXT:    vmseq.vi v8, v16, 0
-; RV64ZBS-NEXT:    vmseq.vi v0, v24, 0
-; RV64ZBS-NEXT:    vmv.v.i v16, 0
-; RV64ZBS-NEXT:    vmerge.vim v24, v16, 1, v0
-; RV64ZBS-NEXT:    vse8.v v24, (a0)
+; RV64ZBS-NEXT:    vmerge.vim v16, v24, 1, v0
+; RV64ZBS-NEXT:    vse8.v v16, (a2)
 ; RV64ZBS-NEXT:    vmv1r.v v0, v8
-; RV64ZBS-NEXT:    vmerge.vim v8, v16, 1, v0
+; RV64ZBS-NEXT:    vmerge.vim v8, v24, 1, v0
 ; RV64ZBS-NEXT:    addi a0, sp, 128
 ; RV64ZBS-NEXT:    vse8.v v8, (a0)
 ; RV64ZBS-NEXT:    lbu a0, 0(a1)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
index fdee80fb95627e..e9dca2c42e835b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
@@ -607,9 +607,9 @@ define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) {
 ; VLA-NEXT:    li a2, 64
 ; VLA-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
 ; VLA-NEXT:    vlm.v v0, (a0)
+; VLA-NEXT:    li a0, 42
 ; VLA-NEXT:    vmv.v.i v8, 0
 ; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
-; VLA-NEXT:    li a0, 42
 ; VLA-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
 ; VLA-NEXT:    vslidedown.vx v8, v8, a0
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
@@ -764,8 +764,8 @@ define void @extract_v2i1_nxv64i1_42(<vscale x 64 x i1> %x, ptr %y) {
 ; VLA:       # %bb.0:
 ; VLA-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; VLA-NEXT:    vmv.v.i v8, 0
-; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
 ; VLA-NEXT:    li a1, 42
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
 ; VLA-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
 ; VLA-NEXT:    vslidedown.vx v8, v8, a1
 ; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
index cb830d668d2e8c..a193d4e4e689f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll
@@ -177,8 +177,8 @@ define i64 @extractelt_v4i64(ptr %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vx v10, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v10
@@ -273,8 +273,8 @@ define i64 @extractelt_v3i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v8, v8, 5
+; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
 ;
@@ -426,12 +426,12 @@ define i64 @extractelt_v2i64_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vslidedown.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
 ;
@@ -575,12 +575,12 @@ define i64 @extractelt_v4i64_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vadd.vv v8, v8, v8
 ; RV32-NEXT:    vslidedown.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vsrl.vx v8, v8, a1
+; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
 ;
@@ -686,14 +686,14 @@ define i64 @extractelt_v3i64_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 3, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    add a1, a1, a1
+; RV32-NEXT:    addi a0, a1, 1
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vadd.vv v8, v8, v8
-; RV32-NEXT:    add a1, a1, a1
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v10, v8, a1
+; RV32-NEXT:    vslidedown.vx v8, v8, a0
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    addi a1, a1, 1
-; RV32-NEXT:    vslidedown.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
 ;
@@ -827,19 +827,19 @@ define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV32-NEXT:    addi s0, sp, 384
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    andi a1, a1, 63
-; RV32-NEXT:    slli a1, a1, 2
-; RV32-NEXT:    li a2, 32
-; RV32-NEXT:    addi a3, a0, 128
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v8, (a3)
+; RV32-NEXT:    mv a2, sp
+; RV32-NEXT:    li a3, 32
+; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle32.v v16, (a0)
-; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    vadd.vv v8, v8, v8
+; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    vadd.vv v16, v16, v16
+; RV32-NEXT:    vadd.vv v8, v8, v8
+; RV32-NEXT:    vse32.v v8, (a2)
 ; RV32-NEXT:    vse32.v v16, (a0)
-; RV32-NEXT:    addi a0, sp, 128
-; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    lw a0, 0(a1)
 ; RV32-NEXT:    addi sp, s0, -384
 ; RV32-NEXT:    lw ra, 380(sp) # 4-byte Folded Reload
@@ -855,19 +855,19 @@ define i32 @extractelt_v64i32_idx(ptr %x, i32 zeroext %idx) nounwind {
 ; RV64-NEXT:    addi s0, sp, 384
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    andi a1, a1, 63
-; RV64-NEXT:    slli a1, a1, 2
-; RV64-NEXT:    li a2, 32
-; RV64-NEXT:    addi a3, a0, 128
-; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV64-NEXT:    vle32.v v8, (a3)
+; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    li a3, 32
+; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; RV64-NEXT:    vle32.v v8, (a0)
+; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vle32.v v16, (a0)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    add a1, a0, a1
-; RV64-NEXT:    vadd.vv v8, v8, v8
+; RV64-NEXT:    addi a0, sp, 128
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    vadd.vv v16, v16, v16
+; RV64-NEXT:    vadd.vv v8, v8, v8
+; RV64-NEXT:    vse32.v v8, (a2)
 ; RV64-NEXT:    vse32.v v16, (a0)
-; RV64-NEXT:    addi a0, sp, 128
-; RV64-NEXT:    vse32.v v8, (a0)
 ; RV64-NEXT:    lw a0, 0(a1)
 ; RV64-NEXT:    addi sp, s0, -384
 ; RV64-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
@@ -931,14 +931,14 @@ define void @store_extractelt_v2i64(ptr %x, ptr %p) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    vmv.x.s a2, v8
-; RV32-NEXT:    sw a2, 0(a1)
-; RV32-NEXT:    sw a0, 4(a1)
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vmv.x.s a2, v9
+; RV32-NEXT:    sw a0, 0(a1)
+; RV32-NEXT:    sw a2, 4(a1)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: store_extractelt_v2i64:
@@ -1062,17 +1062,17 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV32NOM-NEXT:    addi a0, a0, %lo(.LCPI46_0)
 ; RV32NOM-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32NOM-NEXT:    vle32.v v9, (a0)
-; RV32NOM-NEXT:    vmulh.vv v9, v8, v9
 ; RV32NOM-NEXT:    lui a0, 1044480
 ; RV32NOM-NEXT:    vmv.s.x v10, a0
-; RV32NOM-NEXT:    vsext.vf4 v11, v10
-; RV32NOM-NEXT:    vand.vv v8, v8, v11
-; RV32NOM-NEXT:    vadd.vv v8, v9, v8
 ; RV32NOM-NEXT:    lui a0, 12320
 ; RV32NOM-NEXT:    addi a0, a0, 257
+; RV32NOM-NEXT:    vsext.vf4 v11, v10
+; RV32NOM-NEXT:    vand.vv v10, v8, v11
+; RV32NOM-NEXT:    vmulh.vv v8, v8, v9
 ; RV32NOM-NEXT:    vmv.s.x v9, a0
-; RV32NOM-NEXT:    vsext.vf4 v10, v9
-; RV32NOM-NEXT:    vsra.vv v9, v8, v10
+; RV32NOM-NEXT:    vsext.vf4 v11, v9
+; RV32NOM-NEXT:    vadd.vv v8, v8, v10
+; RV32NOM-NEXT:    vsra.vv v9, v8, v11
 ; RV32NOM-NEXT:    vsrl.vi v8, v8, 31
 ; RV32NOM-NEXT:    vadd.vv v8, v9, v8
 ; RV32NOM-NEXT:    vslidedown.vi v8, v8, 2
@@ -1083,10 +1083,10 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32M-NEXT:    vslidedown.vi v8, v8, 2
-; RV32M-NEXT:    vmv.x.s a0, v8
-; RV32M-NEXT:    lui a1, 322639
-; RV32M-NEXT:    addi a1, a1, -945
-; RV32M-NEXT:    mulh a0, a0, a1
+; RV32M-NEXT:    lui a0, 322639
+; RV32M-NEXT:    vmv.x.s a1, v8
+; RV32M-NEXT:    addi a0, a0, -945
+; RV32M-NEXT:    mulh a0, a1, a0
 ; RV32M-NEXT:    srli a1, a0, 31
 ; RV32M-NEXT:    srai a0, a0, 2
 ; RV32M-NEXT:    add a0, a0, a1
@@ -1098,15 +1098,15 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV64NOM-NEXT:    addi a0, a0, %lo(.LCPI46_0)
 ; RV64NOM-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64NOM-NEXT:    vle32.v v9, (a0)
-; RV64NOM-NEXT:    vmulh.vv v9, v8, v9
 ; RV64NOM-NEXT:    lui a0, 1044480
 ; RV64NOM-NEXT:    vmv.s.x v10, a0
-; RV64NOM-NEXT:    vsext.vf4 v11, v10
-; RV64NOM-NEXT:    vand.vv v8, v8, v11
-; RV64NOM-NEXT:    vadd.vv v8, v9, v8
 ; RV64NOM-NEXT:    lui a0, 12320
 ; RV64NOM-NEXT:    addi a0, a0, 257
+; RV64NOM-NEXT:    vsext.vf4 v11, v10
+; RV64NOM-NEXT:    vand.vv v10, v8, v11
+; RV64NOM-NEXT:    vmulh.vv v8, v8, v9
 ; RV64NOM-NEXT:    vmv.s.x v9, a0
+; RV64NOM-NEXT:    vadd.vv v8, v8, v10
 ; RV64NOM-NEXT:    vsext.vf4 v10, v9
 ; RV64NOM-NEXT:    vsra.vv v8, v8, v10
 ; RV64NOM-NEXT:    vsrl.vi v9, v8, 31
@@ -1119,10 +1119,10 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) {
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64M-NEXT:    vslidedown.vi v8, v8, 2
-; RV64M-NEXT:    vmv.x.s a0, v8
-; RV64M-NEXT:    lui a1, 322639
-; RV64M-NEXT:    addiw a1, a1, -945
-; RV64M-NEXT:    mul a0, a0, a1
+; RV64M-NEXT:    lui a0, 322639
+; RV64M-NEXT:    vmv.x.s a1, v8
+; RV64M-NEXT:    addiw a0, a0, -945
+; RV64M-NEXT:    mul a0, a1, a0
 ; RV64M-NEXT:    srli a1, a0, 63
 ; RV64M-NEXT:    srai a0, a0, 34
 ; RV64M-NEXT:    add a0, a0, a1
@@ -1149,10 +1149,10 @@ define i32 @extractelt_udiv_v4i32(<4 x i32> %x) {
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32M-NEXT:    vslidedown.vi v8, v8, 2
-; RV32M-NEXT:    vmv.x.s a0, v8
-; RV32M-NEXT:    lui a1, 322639
-; RV32M-NEXT:    addi a1, a1, -945
-; RV32M-NEXT:    mulhu a0, a0, a1
+; RV32M-NEXT:    lui a0, 322639
+; RV32M-NEXT:    vmv.x.s a1, v8
+; RV32M-NEXT:    addi a0, a0, -945
+; RV32M-NEXT:    mulhu a0, a1, a0
 ; RV32M-NEXT:    srli a0, a0, 2
 ; RV32M-NEXT:    ret
 ;
@@ -1172,11 +1172,11 @@ define i32 @extractelt_udiv_v4i32(<4 x i32> %x) {
 ; RV64M-LABEL: extractelt_udiv_v4i32:
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    lui a0, 322639
-; RV64M-NEXT:    addi a0, a0, -945
-; RV64M-NEXT:    slli a0, a0, 32
 ; RV64M-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64M-NEXT:    vslidedown.vi v8, v8, 2
+; RV64M-NEXT:    addi a0, a0, -945
 ; RV64M-NEXT:    vmv.x.s a1, v8
+; RV64M-NEXT:    slli a0, a0, 32
 ; RV64M-NEXT:    slli a1, a1, 32
 ; RV64M-NEXT:    mulhu a0, a1, a0
 ; RV64M-NEXT:    srli a0, a0, 34
@@ -1191,8 +1191,8 @@ define float @extractelt_fadd_v4f32(<4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 267520
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fadd.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
@@ -1206,8 +1206,8 @@ define float @extractelt_fsub_v4f32(<4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 267520
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fsub.s fa0, fa4, fa5
 ; CHECK-NEXT:    ret
@@ -1221,8 +1221,8 @@ define float @extractelt_fmul_v4f32(<4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 267520
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fmul.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
@@ -1236,8 +1236,8 @@ define float @extractelt_fdiv_v4f32(<4 x float> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    lui a0, 267520
+; CHECK-NEXT:    vfmv.f.s fa5, v8
 ; CHECK-NEXT:    fmv.w.x fa4, a0
 ; CHECK-NEXT:    fdiv.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
index 84895715e814f9..ab2d00b9b9137c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll
@@ -123,10 +123,10 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: ceil_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -148,10 +148,10 @@ define <1 x float> @ceil_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -171,10 +171,10 @@ define <2 x float> @ceil_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -194,10 +194,10 @@ define <4 x float> @ceil_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -217,10 +217,10 @@ define <8 x float> @ceil_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -240,10 +240,10 @@ define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
index 3c99870dba950c..c6ce7c1bbe8b4a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll
@@ -123,10 +123,10 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: floor_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -148,10 +148,10 @@ define <1 x float> @floor_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -171,10 +171,10 @@ define <2 x float> @floor_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -194,10 +194,10 @@ define <4 x float> @floor_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -217,10 +217,10 @@ define <8 x float> @floor_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -240,10 +240,10 @@ define <16 x float> @floor_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index f86fc509983534..02e99ea513e69b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -72,10 +72,10 @@ define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -154,10 +154,10 @@ define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -238,10 +238,10 @@ define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.floor.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_v16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -283,10 +283,10 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -324,10 +324,10 @@ define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.floor.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_floor_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.floor.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_floor_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -649,9 +649,9 @@ declare <15 x double> @llvm.vp.floor.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_floor_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v15f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.floor.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_floor_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v16f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.floor.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 2
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 2
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 2
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index cefb246f3821a3..9a3838d57a0b07 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -63,11 +63,9 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -135,11 +133,9 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
@@ -209,11 +205,9 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -285,11 +279,9 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -610,56 +602,72 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v6, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v16, (a0)
 ; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v7, v0, 2
-; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    bltu a2, a1, .LBB24_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB24_2:
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vmfeq.vv v26, v16, v16, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -669,7 +677,7 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -677,8 +685,7 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -688,12 +695,13 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v16, v8, v0.t
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -718,18 +726,21 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    bltu a2, a1, .LBB25_2
 ; CHECK-NEXT:  # %bb.1:
@@ -738,36 +749,25 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmv8r.v v16, v24
-; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -776,10 +776,7 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index ae592119cf8815..900e02876cbe1e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -315,21 +315,19 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v10, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v10, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v11, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -356,14 +354,12 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index 35bd8b7c17e6de..4a7f888fbced4f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -63,11 +63,9 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -135,11 +133,9 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
@@ -209,11 +205,9 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -285,11 +279,9 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -610,56 +602,72 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v6, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v16, (a0)
 ; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v7, v0, 2
-; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    bltu a2, a1, .LBB24_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB24_2:
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vmfeq.vv v26, v16, v16, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmin.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -669,7 +677,7 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -677,8 +685,7 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -688,12 +695,13 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v16, v8, v0.t
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -718,18 +726,21 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    bltu a2, a1, .LBB25_2
 ; CHECK-NEXT:  # %bb.1:
@@ -738,36 +749,25 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmv8r.v v16, v24
-; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -776,10 +776,7 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double>
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index 8e042fc0785e19..db970c89d935c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -315,21 +315,19 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v10, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v10, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v11, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -356,14 +354,12 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
index 0b9fabb832e296..3a7ded1537ef66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll
@@ -106,10 +106,10 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp {
 ; CHECK-LABEL: nearbyint_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI4_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI4_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI4_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -132,10 +132,10 @@ define <2 x float> @nearbyint_v2f32(<2 x float> %v) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -156,10 +156,10 @@ define <4 x float> @nearbyint_v4f32(<4 x float> %v) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -180,10 +180,10 @@ define <8 x float> @nearbyint_v8f32(<8 x float> %v) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -204,10 +204,10 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 10de74824548c1..b0f8bc9dcc6bd5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -42,11 +42,11 @@ define <4 x float> @hang_when_merging_stores_after_legalization(<8 x float> %x,
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    li a0, 7
+; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vmul.vx v14, v12, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v14
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vadd.vi v8, v14, -14
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vrgatherei16.vv v12, v10, v8, v0.t
@@ -252,8 +252,8 @@ define dso_local void @splat_load_licm(ptr %0) {
 ; RV32-LABEL: splat_load_licm:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lui a1, 1
-; RV32-NEXT:    add a1, a0, a1
 ; RV32-NEXT:    lui a2, 263168
+; RV32-NEXT:    add a1, a0, a1
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a2
 ; RV32-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
@@ -266,8 +266,8 @@ define dso_local void @splat_load_licm(ptr %0) {
 ; RV64V-LABEL: splat_load_licm:
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    lui a1, 1
-; RV64V-NEXT:    add a1, a0, a1
 ; RV64V-NEXT:    lui a2, 263168
+; RV64V-NEXT:    add a1, a0, a1
 ; RV64V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.x v8, a2
 ; RV64V-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
@@ -280,8 +280,8 @@ define dso_local void @splat_load_licm(ptr %0) {
 ; RVA22U64-LABEL: splat_load_licm:
 ; RVA22U64:       # %bb.0:
 ; RVA22U64-NEXT:    lui a1, 1
-; RVA22U64-NEXT:    add a1, a1, a0
 ; RVA22U64-NEXT:    lui a2, 263168
+; RVA22U64-NEXT:    add a1, a1, a0
 ; RVA22U64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RVA22U64-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
@@ -294,8 +294,8 @@ define dso_local void @splat_load_licm(ptr %0) {
 ; RV64ZVFHMIN-LABEL: splat_load_licm:
 ; RV64ZVFHMIN:       # %bb.0:
 ; RV64ZVFHMIN-NEXT:    lui a1, 1
-; RV64ZVFHMIN-NEXT:    add a1, a0, a1
 ; RV64ZVFHMIN-NEXT:    lui a2, 263168
+; RV64ZVFHMIN-NEXT:    add a1, a0, a1
 ; RV64ZVFHMIN-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64ZVFHMIN-NEXT:    vmv.v.x v8, a2
 ; RV64ZVFHMIN-NEXT:  .LBB12_1: # =>This Inner Loop Header: Depth=1
@@ -593,22 +593,6 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV32-NEXT:    flw fs1, 52(s0)
 ; RV32-NEXT:    flw fs2, 56(s0)
 ; RV32-NEXT:    flw fs3, 60(s0)
-; RV32-NEXT:    fsw fs0, 112(sp)
-; RV32-NEXT:    fsw fs1, 116(sp)
-; RV32-NEXT:    fsw fs2, 120(sp)
-; RV32-NEXT:    fsw fs3, 124(sp)
-; RV32-NEXT:    fsw ft8, 96(sp)
-; RV32-NEXT:    fsw ft9, 100(sp)
-; RV32-NEXT:    fsw ft10, 104(sp)
-; RV32-NEXT:    fsw ft11, 108(sp)
-; RV32-NEXT:    fsw ft4, 80(sp)
-; RV32-NEXT:    fsw ft5, 84(sp)
-; RV32-NEXT:    fsw ft6, 88(sp)
-; RV32-NEXT:    fsw ft7, 92(sp)
-; RV32-NEXT:    fsw ft0, 64(sp)
-; RV32-NEXT:    fsw ft1, 68(sp)
-; RV32-NEXT:    fsw ft2, 72(sp)
-; RV32-NEXT:    fsw ft3, 76(sp)
 ; RV32-NEXT:    sw a4, 48(sp)
 ; RV32-NEXT:    sw a5, 52(sp)
 ; RV32-NEXT:    sw a6, 56(sp)
@@ -626,6 +610,22 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV32-NEXT:    fsw fa2, 8(sp)
 ; RV32-NEXT:    fsw fa3, 12(sp)
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    fsw fs0, 112(sp)
+; RV32-NEXT:    fsw fs1, 116(sp)
+; RV32-NEXT:    fsw fs2, 120(sp)
+; RV32-NEXT:    fsw fs3, 124(sp)
+; RV32-NEXT:    fsw ft8, 96(sp)
+; RV32-NEXT:    fsw ft9, 100(sp)
+; RV32-NEXT:    fsw ft10, 104(sp)
+; RV32-NEXT:    fsw ft11, 108(sp)
+; RV32-NEXT:    fsw ft4, 80(sp)
+; RV32-NEXT:    fsw ft5, 84(sp)
+; RV32-NEXT:    fsw ft6, 88(sp)
+; RV32-NEXT:    fsw ft7, 92(sp)
+; RV32-NEXT:    fsw ft0, 64(sp)
+; RV32-NEXT:    fsw ft1, 68(sp)
+; RV32-NEXT:    fsw ft2, 72(sp)
+; RV32-NEXT:    fsw ft3, 76(sp)
 ; RV32-NEXT:    mv a1, sp
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
@@ -682,22 +682,22 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV64-NEXT:    addi s0, sp, 256
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
-; RV64-NEXT:    fmv.w.x ft0, a0
-; RV64-NEXT:    fmv.w.x ft1, a1
-; RV64-NEXT:    fmv.w.x ft2, a2
-; RV64-NEXT:    fmv.w.x ft3, a3
-; RV64-NEXT:    fmv.w.x ft4, a4
-; RV64-NEXT:    fmv.w.x ft5, a5
-; RV64-NEXT:    fmv.w.x ft6, a6
-; RV64-NEXT:    fmv.w.x ft7, a7
-; RV64-NEXT:    flw ft8, 0(s0)
-; RV64-NEXT:    flw ft9, 8(s0)
-; RV64-NEXT:    flw ft10, 16(s0)
-; RV64-NEXT:    flw ft11, 24(s0)
-; RV64-NEXT:    flw fs0, 32(s0)
-; RV64-NEXT:    flw fs1, 40(s0)
-; RV64-NEXT:    flw fs2, 48(s0)
-; RV64-NEXT:    flw fs3, 56(s0)
+; RV64-NEXT:    fmv.w.x ft4, a0
+; RV64-NEXT:    fmv.w.x ft5, a1
+; RV64-NEXT:    fmv.w.x ft6, a2
+; RV64-NEXT:    fmv.w.x ft7, a3
+; RV64-NEXT:    fmv.w.x fs0, a4
+; RV64-NEXT:    fmv.w.x fs1, a5
+; RV64-NEXT:    fmv.w.x fs2, a6
+; RV64-NEXT:    fmv.w.x fs3, a7
+; RV64-NEXT:    flw ft0, 0(s0)
+; RV64-NEXT:    flw ft1, 8(s0)
+; RV64-NEXT:    flw ft2, 16(s0)
+; RV64-NEXT:    flw ft3, 24(s0)
+; RV64-NEXT:    flw ft8, 32(s0)
+; RV64-NEXT:    flw ft9, 40(s0)
+; RV64-NEXT:    flw ft10, 48(s0)
+; RV64-NEXT:    flw ft11, 56(s0)
 ; RV64-NEXT:    flw fs4, 64(s0)
 ; RV64-NEXT:    flw fs5, 72(s0)
 ; RV64-NEXT:    flw fs6, 80(s0)
@@ -706,22 +706,6 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV64-NEXT:    flw fs9, 104(s0)
 ; RV64-NEXT:    flw fs10, 112(s0)
 ; RV64-NEXT:    flw fs11, 120(s0)
-; RV64-NEXT:    fsw fs8, 112(sp)
-; RV64-NEXT:    fsw fs9, 116(sp)
-; RV64-NEXT:    fsw fs10, 120(sp)
-; RV64-NEXT:    fsw fs11, 124(sp)
-; RV64-NEXT:    fsw fs4, 96(sp)
-; RV64-NEXT:    fsw fs5, 100(sp)
-; RV64-NEXT:    fsw fs6, 104(sp)
-; RV64-NEXT:    fsw fs7, 108(sp)
-; RV64-NEXT:    fsw fs0, 80(sp)
-; RV64-NEXT:    fsw fs1, 84(sp)
-; RV64-NEXT:    fsw fs2, 88(sp)
-; RV64-NEXT:    fsw fs3, 92(sp)
-; RV64-NEXT:    fsw ft8, 64(sp)
-; RV64-NEXT:    fsw ft9, 68(sp)
-; RV64-NEXT:    fsw ft10, 72(sp)
-; RV64-NEXT:    fsw ft11, 76(sp)
 ; RV64-NEXT:    fsw fa4, 16(sp)
 ; RV64-NEXT:    fsw fa5, 20(sp)
 ; RV64-NEXT:    fsw fa6, 24(sp)
@@ -730,15 +714,31 @@ define <32 x float> @buildvec_v32f32(float %e0, float %e1, float %e2, float %e3,
 ; RV64-NEXT:    fsw fa1, 4(sp)
 ; RV64-NEXT:    fsw fa2, 8(sp)
 ; RV64-NEXT:    fsw fa3, 12(sp)
-; RV64-NEXT:    fsw ft4, 48(sp)
-; RV64-NEXT:    fsw ft5, 52(sp)
-; RV64-NEXT:    fsw ft6, 56(sp)
-; RV64-NEXT:    fsw ft7, 60(sp)
-; RV64-NEXT:    fsw ft0, 32(sp)
-; RV64-NEXT:    fsw ft1, 36(sp)
-; RV64-NEXT:    fsw ft2, 40(sp)
-; RV64-NEXT:    fsw ft3, 44(sp)
 ; RV64-NEXT:    li a0, 32
+; RV64-NEXT:    fsw fs0, 48(sp)
+; RV64-NEXT:    fsw fs1, 52(sp)
+; RV64-NEXT:    fsw fs2, 56(sp)
+; RV64-NEXT:    fsw fs3, 60(sp)
+; RV64-NEXT:    fsw ft4, 32(sp)
+; RV64-NEXT:    fsw ft5, 36(sp)
+; RV64-NEXT:    fsw ft6, 40(sp)
+; RV64-NEXT:    fsw ft7, 44(sp)
+; RV64-NEXT:    fsw fs8, 112(sp)
+; RV64-NEXT:    fsw fs9, 116(sp)
+; RV64-NEXT:    fsw fs10, 120(sp)
+; RV64-NEXT:    fsw fs11, 124(sp)
+; RV64-NEXT:    fsw fs4, 96(sp)
+; RV64-NEXT:    fsw fs5, 100(sp)
+; RV64-NEXT:    fsw fs6, 104(sp)
+; RV64-NEXT:    fsw fs7, 108(sp)
+; RV64-NEXT:    fsw ft8, 80(sp)
+; RV64-NEXT:    fsw ft9, 84(sp)
+; RV64-NEXT:    fsw ft10, 88(sp)
+; RV64-NEXT:    fsw ft11, 92(sp)
+; RV64-NEXT:    fsw ft0, 64(sp)
+; RV64-NEXT:    fsw ft1, 68(sp)
+; RV64-NEXT:    fsw ft2, 72(sp)
+; RV64-NEXT:    fsw ft3, 76(sp)
 ; RV64-NEXT:    mv a1, sp
 ; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
@@ -899,24 +899,24 @@ define <16 x double> @buildvec_v16f64(double %e0, double %e1, double %e2, double
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    sw a0, 120(sp)
 ; RV32-NEXT:    sw a1, 124(sp)
-; RV32-NEXT:    fld ft0, 120(sp)
+; RV32-NEXT:    fld ft0, 0(s0)
+; RV32-NEXT:    fld ft1, 8(s0)
+; RV32-NEXT:    fld ft2, 16(s0)
+; RV32-NEXT:    fld ft3, 24(s0)
+; RV32-NEXT:    fld ft4, 120(sp)
 ; RV32-NEXT:    sw a2, 120(sp)
 ; RV32-NEXT:    sw a3, 124(sp)
-; RV32-NEXT:    fld ft1, 120(sp)
+; RV32-NEXT:    fld ft5, 120(sp)
 ; RV32-NEXT:    sw a4, 120(sp)
 ; RV32-NEXT:    sw a5, 124(sp)
-; RV32-NEXT:    fld ft2, 120(sp)
+; RV32-NEXT:    fld ft6, 120(sp)
 ; RV32-NEXT:    sw a6, 120(sp)
 ; RV32-NEXT:    sw a7, 124(sp)
-; RV32-NEXT:    fld ft3, 120(sp)
-; RV32-NEXT:    fld ft4, 0(s0)
-; RV32-NEXT:    fld ft5, 8(s0)
-; RV32-NEXT:    fld ft6, 16(s0)
-; RV32-NEXT:    fld ft7, 24(s0)
-; RV32-NEXT:    fsd ft4, 224(sp)
-; RV32-NEXT:    fsd ft5, 232(sp)
-; RV32-NEXT:    fsd ft6, 240(sp)
-; RV32-NEXT:    fsd ft7, 248(sp)
+; RV32-NEXT:    fld ft7, 120(sp)
+; RV32-NEXT:    fsd ft0, 224(sp)
+; RV32-NEXT:    fsd ft1, 232(sp)
+; RV32-NEXT:    fsd ft2, 240(sp)
+; RV32-NEXT:    fsd ft3, 248(sp)
 ; RV32-NEXT:    fsd fa4, 160(sp)
 ; RV32-NEXT:    fsd fa5, 168(sp)
 ; RV32-NEXT:    fsd fa6, 176(sp)
@@ -925,10 +925,10 @@ define <16 x double> @buildvec_v16f64(double %e0, double %e1, double %e2, double
 ; RV32-NEXT:    fsd fa1, 136(sp)
 ; RV32-NEXT:    fsd fa2, 144(sp)
 ; RV32-NEXT:    fsd fa3, 152(sp)
-; RV32-NEXT:    fsd ft0, 192(sp)
-; RV32-NEXT:    fsd ft1, 200(sp)
-; RV32-NEXT:    fsd ft2, 208(sp)
-; RV32-NEXT:    fsd ft3, 216(sp)
+; RV32-NEXT:    fsd ft4, 192(sp)
+; RV32-NEXT:    fsd ft5, 200(sp)
+; RV32-NEXT:    fsd ft6, 208(sp)
+; RV32-NEXT:    fsd ft7, 216(sp)
 ; RV32-NEXT:    addi a0, sp, 128
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
@@ -1038,56 +1038,58 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    sw a0, 120(sp)
 ; RV32-NEXT:    sw a1, 124(sp)
-; RV32-NEXT:    fld ft0, 120(sp)
+; RV32-NEXT:    fld ft0, 0(s0)
+; RV32-NEXT:    fld ft1, 8(s0)
+; RV32-NEXT:    fld ft2, 16(s0)
+; RV32-NEXT:    fld ft3, 24(s0)
+; RV32-NEXT:    fld ft4, 32(s0)
+; RV32-NEXT:    fld ft5, 40(s0)
+; RV32-NEXT:    fld ft6, 48(s0)
+; RV32-NEXT:    fld ft7, 56(s0)
+; RV32-NEXT:    fld ft8, 64(s0)
+; RV32-NEXT:    fld ft9, 72(s0)
+; RV32-NEXT:    fld ft10, 80(s0)
+; RV32-NEXT:    fld ft11, 88(s0)
+; RV32-NEXT:    fld fs0, 96(s0)
+; RV32-NEXT:    fld fs1, 104(s0)
+; RV32-NEXT:    fld fs2, 112(s0)
+; RV32-NEXT:    fld fs3, 120(s0)
+; RV32-NEXT:    fld fs4, 128(s0)
+; RV32-NEXT:    fld fs5, 136(s0)
+; RV32-NEXT:    fld fs6, 144(s0)
+; RV32-NEXT:    fld fs7, 152(s0)
+; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    addi a1, sp, 256
+; RV32-NEXT:    fld fs8, 120(sp)
 ; RV32-NEXT:    sw a2, 120(sp)
 ; RV32-NEXT:    sw a3, 124(sp)
-; RV32-NEXT:    fld ft1, 120(sp)
+; RV32-NEXT:    fld fs9, 120(sp)
 ; RV32-NEXT:    sw a4, 120(sp)
 ; RV32-NEXT:    sw a5, 124(sp)
-; RV32-NEXT:    fld ft2, 120(sp)
+; RV32-NEXT:    fld fs10, 120(sp)
 ; RV32-NEXT:    sw a6, 120(sp)
 ; RV32-NEXT:    sw a7, 124(sp)
-; RV32-NEXT:    fld ft3, 120(sp)
-; RV32-NEXT:    fld ft4, 0(s0)
-; RV32-NEXT:    fld ft5, 8(s0)
-; RV32-NEXT:    fld ft6, 16(s0)
-; RV32-NEXT:    fld ft7, 24(s0)
-; RV32-NEXT:    fld ft8, 32(s0)
-; RV32-NEXT:    fld ft9, 40(s0)
-; RV32-NEXT:    fld ft10, 48(s0)
-; RV32-NEXT:    fld ft11, 56(s0)
-; RV32-NEXT:    fld fs0, 64(s0)
-; RV32-NEXT:    fld fs1, 72(s0)
-; RV32-NEXT:    fld fs2, 80(s0)
-; RV32-NEXT:    fld fs3, 88(s0)
-; RV32-NEXT:    fld fs4, 96(s0)
-; RV32-NEXT:    fld fs5, 104(s0)
-; RV32-NEXT:    fld fs6, 112(s0)
-; RV32-NEXT:    fld fs7, 120(s0)
-; RV32-NEXT:    fld fs8, 128(s0)
-; RV32-NEXT:    fld fs9, 136(s0)
-; RV32-NEXT:    fld fs10, 144(s0)
-; RV32-NEXT:    fld fs11, 152(s0)
-; RV32-NEXT:    fsd fs8, 224(sp)
-; RV32-NEXT:    fsd fs9, 232(sp)
-; RV32-NEXT:    fsd fs10, 240(sp)
-; RV32-NEXT:    fsd fs11, 248(sp)
-; RV32-NEXT:    fsd fs4, 192(sp)
-; RV32-NEXT:    fsd fs5, 200(sp)
-; RV32-NEXT:    fsd fs6, 208(sp)
-; RV32-NEXT:    fsd fs7, 216(sp)
-; RV32-NEXT:    fsd fs0, 160(sp)
-; RV32-NEXT:    fsd fs1, 168(sp)
-; RV32-NEXT:    fsd fs2, 176(sp)
-; RV32-NEXT:    fsd fs3, 184(sp)
-; RV32-NEXT:    fsd ft8, 128(sp)
-; RV32-NEXT:    fsd ft9, 136(sp)
-; RV32-NEXT:    fsd ft10, 144(sp)
-; RV32-NEXT:    fsd ft11, 152(sp)
-; RV32-NEXT:    fsd ft4, 352(sp)
-; RV32-NEXT:    fsd ft5, 360(sp)
-; RV32-NEXT:    fsd ft6, 368(sp)
-; RV32-NEXT:    fsd ft7, 376(sp)
+; RV32-NEXT:    fld fs11, 120(sp)
+; RV32-NEXT:    fsd fs4, 224(sp)
+; RV32-NEXT:    fsd fs5, 232(sp)
+; RV32-NEXT:    fsd fs6, 240(sp)
+; RV32-NEXT:    fsd fs7, 248(sp)
+; RV32-NEXT:    fsd fs0, 192(sp)
+; RV32-NEXT:    fsd fs1, 200(sp)
+; RV32-NEXT:    fsd fs2, 208(sp)
+; RV32-NEXT:    fsd fs3, 216(sp)
+; RV32-NEXT:    fsd ft8, 160(sp)
+; RV32-NEXT:    fsd ft9, 168(sp)
+; RV32-NEXT:    fsd ft10, 176(sp)
+; RV32-NEXT:    fsd ft11, 184(sp)
+; RV32-NEXT:    fsd ft4, 128(sp)
+; RV32-NEXT:    fsd ft5, 136(sp)
+; RV32-NEXT:    fsd ft6, 144(sp)
+; RV32-NEXT:    fsd ft7, 152(sp)
+; RV32-NEXT:    fsd ft0, 352(sp)
+; RV32-NEXT:    fsd ft1, 360(sp)
+; RV32-NEXT:    fsd ft2, 368(sp)
+; RV32-NEXT:    fsd ft3, 376(sp)
 ; RV32-NEXT:    fsd fa4, 288(sp)
 ; RV32-NEXT:    fsd fa5, 296(sp)
 ; RV32-NEXT:    fsd fa6, 304(sp)
@@ -1096,15 +1098,13 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 ; RV32-NEXT:    fsd fa1, 264(sp)
 ; RV32-NEXT:    fsd fa2, 272(sp)
 ; RV32-NEXT:    fsd fa3, 280(sp)
-; RV32-NEXT:    fsd ft0, 320(sp)
-; RV32-NEXT:    fsd ft1, 328(sp)
-; RV32-NEXT:    fsd ft2, 336(sp)
-; RV32-NEXT:    fsd ft3, 344(sp)
-; RV32-NEXT:    addi a0, sp, 128
+; RV32-NEXT:    fsd fs8, 320(sp)
+; RV32-NEXT:    fsd fs9, 328(sp)
+; RV32-NEXT:    fsd fs10, 336(sp)
+; RV32-NEXT:    fsd fs11, 344(sp)
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v16, (a0)
-; RV32-NEXT:    addi a0, sp, 256
-; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    vle64.v v8, (a1)
 ; RV32-NEXT:    addi sp, s0, -512
 ; RV32-NEXT:    .cfi_def_cfa sp, 512
 ; RV32-NEXT:    lw ra, 508(sp) # 4-byte Folded Reload
@@ -1190,6 +1190,8 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 ; RV64-NEXT:    fsd fa1, 136(sp)
 ; RV64-NEXT:    fsd fa2, 144(sp)
 ; RV64-NEXT:    fsd fa3, 152(sp)
+; RV64-NEXT:    addi a0, sp, 128
+; RV64-NEXT:    mv a1, sp
 ; RV64-NEXT:    fsd fs0, 96(sp)
 ; RV64-NEXT:    fsd fs1, 104(sp)
 ; RV64-NEXT:    fsd fs2, 112(sp)
@@ -1206,11 +1208,9 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 ; RV64-NEXT:    fsd ft1, 8(sp)
 ; RV64-NEXT:    fsd ft2, 16(sp)
 ; RV64-NEXT:    fsd ft3, 24(sp)
-; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
-; RV64-NEXT:    mv a0, sp
-; RV64-NEXT:    vle64.v v16, (a0)
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi sp, s0, -384
 ; RV64-NEXT:    .cfi_def_cfa sp, 384
 ; RV64-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
@@ -1266,20 +1266,19 @@ define <32 x double> @buildvec_v32f64(double %e0, double %e1, double %e2, double
 define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double %e2, double %e3, double %e4, double %e5, double %e6, double %e7, double %e8, double %e9, double %e10, double %e11, double %e12, double %e13, double %e14, double %e15, double %e16, double %e17, double %e18, double %e19, double %e20, double %e21, double %e22, double %e23, double %e24, double %e25, double %e26, double %e27, double %e28, double %e29, double %e30, double %e31) vscale_range(2,2) {
 ; RV32-LABEL: buildvec_v32f64_exact_vlen:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -112
-; RV32-NEXT:    .cfi_def_cfa_offset 112
-; RV32-NEXT:    fsd fs0, 104(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs1, 96(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs2, 88(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs3, 80(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs4, 72(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs5, 64(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs6, 56(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs7, 48(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs8, 40(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs9, 32(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs10, 24(sp) # 8-byte Folded Spill
-; RV32-NEXT:    fsd fs11, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -96
+; RV32-NEXT:    .cfi_def_cfa_offset 96
+; RV32-NEXT:    fsd fs0, 88(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs1, 80(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs2, 72(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs3, 64(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs4, 56(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs5, 48(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs6, 40(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs7, 32(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs8, 24(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs9, 16(sp) # 8-byte Folded Spill
+; RV32-NEXT:    fsd fs10, 8(sp) # 8-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset fs0, -8
 ; RV32-NEXT:    .cfi_offset fs1, -16
 ; RV32-NEXT:    .cfi_offset fs2, -24
@@ -1291,84 +1290,82 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double
 ; RV32-NEXT:    .cfi_offset fs8, -72
 ; RV32-NEXT:    .cfi_offset fs9, -80
 ; RV32-NEXT:    .cfi_offset fs10, -88
-; RV32-NEXT:    .cfi_offset fs11, -96
-; RV32-NEXT:    sw a6, 8(sp)
-; RV32-NEXT:    sw a7, 12(sp)
-; RV32-NEXT:    fld ft6, 8(sp)
-; RV32-NEXT:    sw a4, 8(sp)
-; RV32-NEXT:    sw a5, 12(sp)
-; RV32-NEXT:    fld ft7, 8(sp)
-; RV32-NEXT:    sw a2, 8(sp)
-; RV32-NEXT:    sw a3, 12(sp)
-; RV32-NEXT:    fld ft8, 8(sp)
-; RV32-NEXT:    sw a0, 8(sp)
-; RV32-NEXT:    sw a1, 12(sp)
-; RV32-NEXT:    fld ft9, 8(sp)
-; RV32-NEXT:    fld ft0, 264(sp)
-; RV32-NEXT:    fld ft1, 256(sp)
-; RV32-NEXT:    fld ft2, 248(sp)
-; RV32-NEXT:    fld ft3, 240(sp)
-; RV32-NEXT:    fld ft4, 232(sp)
-; RV32-NEXT:    fld ft5, 224(sp)
-; RV32-NEXT:    fld ft10, 216(sp)
-; RV32-NEXT:    fld ft11, 208(sp)
-; RV32-NEXT:    fld fs0, 200(sp)
+; RV32-NEXT:    sw a6, 0(sp)
+; RV32-NEXT:    sw a7, 4(sp)
+; RV32-NEXT:    fld ft0, 248(sp)
+; RV32-NEXT:    fld ft1, 240(sp)
+; RV32-NEXT:    fld ft2, 232(sp)
+; RV32-NEXT:    fld ft3, 224(sp)
+; RV32-NEXT:    fld ft6, 216(sp)
+; RV32-NEXT:    fld ft8, 208(sp)
+; RV32-NEXT:    fld ft10, 200(sp)
 ; RV32-NEXT:    fld fs1, 192(sp)
-; RV32-NEXT:    fld fs2, 184(sp)
-; RV32-NEXT:    fld fs3, 176(sp)
-; RV32-NEXT:    fld fs4, 152(sp)
-; RV32-NEXT:    fld fs5, 144(sp)
-; RV32-NEXT:    fld fs6, 168(sp)
-; RV32-NEXT:    fld fs7, 160(sp)
-; RV32-NEXT:    fld fs8, 136(sp)
-; RV32-NEXT:    fld fs9, 128(sp)
-; RV32-NEXT:    fld fs10, 120(sp)
-; RV32-NEXT:    fld fs11, 112(sp)
+; RV32-NEXT:    fld ft11, 184(sp)
+; RV32-NEXT:    fld fs4, 176(sp)
+; RV32-NEXT:    fld fs2, 168(sp)
+; RV32-NEXT:    fld fs5, 160(sp)
+; RV32-NEXT:    fld fs3, 136(sp)
+; RV32-NEXT:    fld fs6, 128(sp)
+; RV32-NEXT:    fld fs7, 152(sp)
+; RV32-NEXT:    fld fs8, 144(sp)
+; RV32-NEXT:    fld ft4, 120(sp)
+; RV32-NEXT:    fld ft5, 112(sp)
+; RV32-NEXT:    fld ft7, 104(sp)
+; RV32-NEXT:    fld ft9, 96(sp)
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vfmv.v.f v8, ft9
-; RV32-NEXT:    vfslide1down.vf v12, v8, ft8
 ; RV32-NEXT:    vfmv.v.f v8, fa2
+; RV32-NEXT:    fld fa2, 0(sp)
+; RV32-NEXT:    sw a4, 0(sp)
+; RV32-NEXT:    sw a5, 4(sp)
+; RV32-NEXT:    fld fs0, 0(sp)
+; RV32-NEXT:    sw a2, 0(sp)
+; RV32-NEXT:    sw a3, 4(sp)
+; RV32-NEXT:    fld fs9, 0(sp)
+; RV32-NEXT:    sw a0, 0(sp)
+; RV32-NEXT:    sw a1, 4(sp)
+; RV32-NEXT:    fld fs10, 0(sp)
+; RV32-NEXT:    vfmv.v.f v9, fs8
+; RV32-NEXT:    vfmv.v.f v10, fs6
+; RV32-NEXT:    vfmv.v.f v11, fs5
+; RV32-NEXT:    vfmv.v.f v12, fs4
+; RV32-NEXT:    vfmv.v.f v13, fs1
+; RV32-NEXT:    vfslide1down.vf v17, v9, fs7
+; RV32-NEXT:    vfslide1down.vf v16, v10, fs3
+; RV32-NEXT:    vfslide1down.vf v18, v11, fs2
+; RV32-NEXT:    vfmv.v.f v9, fs10
+; RV32-NEXT:    vfslide1down.vf v19, v12, ft11
+; RV32-NEXT:    vfslide1down.vf v20, v13, ft10
+; RV32-NEXT:    vfslide1down.vf v12, v9, fs9
 ; RV32-NEXT:    vfslide1down.vf v9, v8, fa3
+; RV32-NEXT:    vfmv.v.f v8, ft8
+; RV32-NEXT:    vfslide1down.vf v21, v8, ft6
 ; RV32-NEXT:    vfmv.v.f v8, fa0
 ; RV32-NEXT:    vfslide1down.vf v8, v8, fa1
+; RV32-NEXT:    vfmv.v.f v10, ft3
+; RV32-NEXT:    vfslide1down.vf v22, v10, ft2
 ; RV32-NEXT:    vfmv.v.f v10, fa4
 ; RV32-NEXT:    vfslide1down.vf v10, v10, fa5
 ; RV32-NEXT:    vfmv.v.f v11, fa6
 ; RV32-NEXT:    vfslide1down.vf v11, v11, fa7
-; RV32-NEXT:    vfmv.v.f v13, ft7
-; RV32-NEXT:    vfslide1down.vf v13, v13, ft6
-; RV32-NEXT:    vfmv.v.f v14, fs11
-; RV32-NEXT:    vfslide1down.vf v14, v14, fs10
-; RV32-NEXT:    vfmv.v.f v15, fs9
-; RV32-NEXT:    vfslide1down.vf v15, v15, fs8
-; RV32-NEXT:    vfmv.v.f v16, fs7
-; RV32-NEXT:    vfslide1down.vf v17, v16, fs6
-; RV32-NEXT:    vfmv.v.f v16, fs5
-; RV32-NEXT:    vfslide1down.vf v16, v16, fs4
-; RV32-NEXT:    vfmv.v.f v18, fs3
-; RV32-NEXT:    vfslide1down.vf v18, v18, fs2
-; RV32-NEXT:    vfmv.v.f v19, fs1
-; RV32-NEXT:    vfslide1down.vf v19, v19, fs0
-; RV32-NEXT:    vfmv.v.f v20, ft11
-; RV32-NEXT:    vfslide1down.vf v20, v20, ft10
-; RV32-NEXT:    vfmv.v.f v21, ft5
-; RV32-NEXT:    vfslide1down.vf v21, v21, ft4
-; RV32-NEXT:    vfmv.v.f v22, ft3
-; RV32-NEXT:    vfslide1down.vf v22, v22, ft2
+; RV32-NEXT:    vfmv.v.f v13, fs0
+; RV32-NEXT:    vfslide1down.vf v13, v13, fa2
+; RV32-NEXT:    vfmv.v.f v14, ft9
+; RV32-NEXT:    vfslide1down.vf v14, v14, ft7
+; RV32-NEXT:    vfmv.v.f v15, ft5
+; RV32-NEXT:    vfslide1down.vf v15, v15, ft4
 ; RV32-NEXT:    vfmv.v.f v23, ft1
 ; RV32-NEXT:    vfslide1down.vf v23, v23, ft0
-; RV32-NEXT:    fld fs0, 104(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs1, 96(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs2, 88(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs3, 80(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs4, 72(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs5, 64(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs6, 56(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs7, 48(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs8, 40(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs9, 32(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs10, 24(sp) # 8-byte Folded Reload
-; RV32-NEXT:    fld fs11, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs0, 88(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs1, 80(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs2, 72(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs3, 64(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs4, 56(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs5, 48(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs6, 40(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs7, 32(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs8, 24(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs9, 16(sp) # 8-byte Folded Reload
+; RV32-NEXT:    fld fs10, 8(sp) # 8-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore fs0
 ; RV32-NEXT:    .cfi_restore fs1
 ; RV32-NEXT:    .cfi_restore fs2
@@ -1380,8 +1377,7 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double
 ; RV32-NEXT:    .cfi_restore fs8
 ; RV32-NEXT:    .cfi_restore fs9
 ; RV32-NEXT:    .cfi_restore fs10
-; RV32-NEXT:    .cfi_restore fs11
-; RV32-NEXT:    addi sp, sp, 112
+; RV32-NEXT:    addi sp, sp, 96
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1405,18 +1401,18 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double
 ; RV64-NEXT:    .cfi_offset fs5, -48
 ; RV64-NEXT:    .cfi_offset fs6, -56
 ; RV64-NEXT:    .cfi_offset fs7, -64
-; RV64-NEXT:    fmv.d.x ft4, a7
-; RV64-NEXT:    fmv.d.x ft5, a5
-; RV64-NEXT:    fmv.d.x ft6, a3
-; RV64-NEXT:    fmv.d.x ft7, a1
+; RV64-NEXT:    fmv.d.x ft6, a7
+; RV64-NEXT:    fmv.d.x ft9, a5
+; RV64-NEXT:    fmv.d.x ft10, a3
+; RV64-NEXT:    fmv.d.x ft11, a1
 ; RV64-NEXT:    fld ft0, 184(sp)
 ; RV64-NEXT:    fld ft1, 176(sp)
 ; RV64-NEXT:    fld ft2, 168(sp)
 ; RV64-NEXT:    fld ft3, 160(sp)
-; RV64-NEXT:    fld ft8, 152(sp)
-; RV64-NEXT:    fld ft9, 144(sp)
-; RV64-NEXT:    fld ft10, 136(sp)
-; RV64-NEXT:    fld ft11, 128(sp)
+; RV64-NEXT:    fld ft4, 152(sp)
+; RV64-NEXT:    fld ft5, 144(sp)
+; RV64-NEXT:    fld ft7, 136(sp)
+; RV64-NEXT:    fld ft8, 128(sp)
 ; RV64-NEXT:    fld fs0, 120(sp)
 ; RV64-NEXT:    fld fs1, 112(sp)
 ; RV64-NEXT:    fld fs2, 104(sp)
@@ -1427,33 +1423,33 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double
 ; RV64-NEXT:    fld fs7, 80(sp)
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vfmv.v.f v8, fa2
+; RV64-NEXT:    vfmv.v.f v10, fa0
+; RV64-NEXT:    vfmv.v.f v11, fa4
+; RV64-NEXT:    vfmv.v.f v12, fa6
+; RV64-NEXT:    vmv.v.x v13, a0
+; RV64-NEXT:    vmv.v.x v14, a2
 ; RV64-NEXT:    vfslide1down.vf v9, v8, fa3
-; RV64-NEXT:    vfmv.v.f v8, fa0
-; RV64-NEXT:    vfslide1down.vf v8, v8, fa1
-; RV64-NEXT:    vfmv.v.f v10, fa4
-; RV64-NEXT:    vfslide1down.vf v10, v10, fa5
-; RV64-NEXT:    vfmv.v.f v11, fa6
-; RV64-NEXT:    vfslide1down.vf v11, v11, fa7
-; RV64-NEXT:    vmv.v.x v12, a0
-; RV64-NEXT:    vfslide1down.vf v12, v12, ft7
-; RV64-NEXT:    vmv.v.x v13, a2
-; RV64-NEXT:    vfslide1down.vf v13, v13, ft6
-; RV64-NEXT:    vmv.v.x v14, a4
-; RV64-NEXT:    vfslide1down.vf v14, v14, ft5
-; RV64-NEXT:    vmv.v.x v15, a6
-; RV64-NEXT:    vfslide1down.vf v15, v15, ft4
-; RV64-NEXT:    vfmv.v.f v16, fs7
-; RV64-NEXT:    vfslide1down.vf v17, v16, fs6
+; RV64-NEXT:    vfslide1down.vf v8, v10, fa1
+; RV64-NEXT:    vfslide1down.vf v10, v11, fa5
+; RV64-NEXT:    vfslide1down.vf v11, v12, fa7
+; RV64-NEXT:    vfmv.v.f v15, fs7
 ; RV64-NEXT:    vfmv.v.f v16, fs5
+; RV64-NEXT:    vfslide1down.vf v12, v13, ft11
+; RV64-NEXT:    vfslide1down.vf v13, v14, ft10
+; RV64-NEXT:    vfslide1down.vf v17, v15, fs6
 ; RV64-NEXT:    vfslide1down.vf v16, v16, fs4
-; RV64-NEXT:    vfmv.v.f v18, fs3
-; RV64-NEXT:    vfslide1down.vf v18, v18, fs2
+; RV64-NEXT:    vmv.v.x v14, a4
+; RV64-NEXT:    vfslide1down.vf v14, v14, ft9
+; RV64-NEXT:    vfmv.v.f v15, fs3
+; RV64-NEXT:    vfslide1down.vf v18, v15, fs2
+; RV64-NEXT:    vmv.v.x v15, a6
+; RV64-NEXT:    vfslide1down.vf v15, v15, ft6
 ; RV64-NEXT:    vfmv.v.f v19, fs1
 ; RV64-NEXT:    vfslide1down.vf v19, v19, fs0
-; RV64-NEXT:    vfmv.v.f v20, ft11
-; RV64-NEXT:    vfslide1down.vf v20, v20, ft10
-; RV64-NEXT:    vfmv.v.f v21, ft9
-; RV64-NEXT:    vfslide1down.vf v21, v21, ft8
+; RV64-NEXT:    vfmv.v.f v20, ft8
+; RV64-NEXT:    vfslide1down.vf v20, v20, ft7
+; RV64-NEXT:    vfmv.v.f v21, ft5
+; RV64-NEXT:    vfslide1down.vf v21, v21, ft4
 ; RV64-NEXT:    vfmv.v.f v22, ft3
 ; RV64-NEXT:    vfslide1down.vf v22, v22, ft2
 ; RV64-NEXT:    vfmv.v.f v23, ft1
@@ -1752,15 +1748,15 @@ define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
+; CHECK-NEXT:    vfmv.v.f v9, fa4
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
-; CHECK-NEXT:    vfslide1down.vf v9, v8, fa3
-; CHECK-NEXT:    vfmv.v.f v8, fa4
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa6
-; CHECK-NEXT:    vmv.v.i v0, 15
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa7
-; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa6
+; CHECK-NEXT:    vfslide1down.vf v10, v8, fa3
+; CHECK-NEXT:    vfslide1down.vf v8, v9, fa7
+; CHECK-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x float> poison, float %e0, i64 0
   %v1 = insertelement <8 x float> %v0, float %e1, i64 1
@@ -1803,15 +1799,15 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v8, fa0
+; CHECK-NEXT:    vfmv.v.f v9, fa4
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa1
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa5
 ; CHECK-NEXT:    vfslide1down.vf v8, v8, fa2
-; CHECK-NEXT:    vfslide1down.vf v9, v8, fa3
-; CHECK-NEXT:    vfmv.v.f v8, fa4
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa5
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa6
-; CHECK-NEXT:    vmv.v.i v0, 15
-; CHECK-NEXT:    vfslide1down.vf v8, v8, fa7
-; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; CHECK-NEXT:    vfslide1down.vf v9, v9, fa6
+; CHECK-NEXT:    vfslide1down.vf v10, v8, fa3
+; CHECK-NEXT:    vfslide1down.vf v8, v9, fa7
+; CHECK-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; CHECK-NEXT:    ret
   %v0 = insertelement <8 x double> poison, double %e0, i64 0
   %v1 = insertelement <8 x double> %v0, double %e1, i64 1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 5ae47a01a37e15..e4b8e9debad271 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -41,12 +41,10 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
 ; V128-NEXT:    vmv1r.v v12, v9
 ; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; V128-NEXT:    vid.v v9
+; V128-NEXT:    vmv.v.i v0, 10
 ; V128-NEXT:    vsrl.vi v14, v9, 1
-; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; V128-NEXT:    vrgatherei16.vv v10, v8, v14
-; V128-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; V128-NEXT:    vmv.v.i v0, 10
-; V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
 ; V128-NEXT:    vmv.v.v v8, v10
 ; V128-NEXT:    ret
@@ -244,26 +242,27 @@ define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) {
 ; V128-NEXT:    slli a0, a0, 3
 ; V128-NEXT:    sub sp, sp, a0
 ; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT:    vmv8r.v v0, v16
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vmv8r.v v24, v16
 ; V128-NEXT:    vmv8r.v v16, v8
+; V128-NEXT:    vmv8r.v v8, v24
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v8, v0, 16
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v0, v8
+; V128-NEXT:    vslidedown.vi v0, v24, 16
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v24, a0, v8
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v24, v8, v0
+; V128-NEXT:    vwmaccu.vx v24, a0, v0
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; V128-NEXT:    vslidedown.vi v0, v16, 16
+; V128-NEXT:    lui a1, 699051
+; V128-NEXT:    li a2, 32
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v8, v0, v16
-; V128-NEXT:    vwmaccu.vx v8, a0, v16
-; V128-NEXT:    lui a1, 699051
 ; V128-NEXT:    addi a1, a1, -1366
 ; V128-NEXT:    vmv.s.x v0, a1
-; V128-NEXT:    li a1, 32
-; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; V128-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; V128-NEXT:    addi a1, sp, 16
 ; V128-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
index 31e2d75e514b41..c14eae0b1de61a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll
@@ -558,13 +558,11 @@ define void @fcmp_ord_vv_v4f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v8, v8
+; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
 ; ZVFHMIN-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -610,13 +608,11 @@ define void @fcmp_uno_vv_v4f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfne.vv v8, v8, v8
+; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
 ; ZVFHMIN-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -1195,13 +1191,13 @@ define void @fcmp_ord_vf_v4f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
@@ -1249,13 +1245,13 @@ define void @fcmp_uno_vf_v4f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
@@ -1836,13 +1832,13 @@ define void @fcmp_ord_fv_v4f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0
@@ -1890,13 +1886,13 @@ define void @fcmp_uno_fv_v4f16(ptr %x, half %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v8, 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
index c24ade1e6d8eff..36bbec12e9b06c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -901,11 +901,11 @@ define void @copysign_vf_v8bf16(ptr %x, bfloat %y) {
 ; CHECK-NEXT:    fmv.x.w a1, fa0
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a2, 8
 ; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    addi a2, a1, -1
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    addi a1, a2, -1
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -923,12 +923,12 @@ define void @copysign_vf_v6bf16(ptr %x, bfloat %y) {
 ; CHECK-NEXT:    fmv.x.w a1, fa0
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a2, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    addi a2, a1, -1
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vand.vx v9, v9, a1
+; CHECK-NEXT:    addi a1, a2, -1
+; CHECK-NEXT:    vand.vx v8, v8, a1
+; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    vse16.v v8, (a0)
@@ -955,11 +955,11 @@ define void @copysign_vf_v8f16(ptr %x, half %y) {
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    addi a2, a1, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a2
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
+; ZVFHMIN-NEXT:    addi a1, a2, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN-NEXT:    ret
@@ -985,12 +985,12 @@ define void @copysign_vf_v6f16(ptr %x, half %y) {
 ; ZVFHMIN-NEXT:    fmv.x.w a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    addi a2, a1, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a2
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
+; ZVFHMIN-NEXT:    addi a1, a2, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    vse16.v v8, (a0)
@@ -1042,8 +1042,8 @@ define void @copysign_neg_v8bf16(ptr %x, ptr %y) {
 ; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vle16.v v9, (a0)
 ; CHECK-NEXT:    lui a1, 8
-; CHECK-NEXT:    vxor.vx v8, v8, a1
 ; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    vxor.vx v8, v8, a1
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vor.vv v8, v9, v8
@@ -1064,9 +1064,9 @@ define void @copysign_neg_v6bf16(ptr %x, ptr %y) {
 ; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vle16.v v9, (a0)
 ; CHECK-NEXT:    lui a1, 8
+; CHECK-NEXT:    addi a2, a1, -1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    addi a2, a1, -1
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
@@ -1097,8 +1097,8 @@ define void @copysign_neg_v8f16(ptr %x, ptr %y) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    addi a2, a1, -1
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v9, v8
@@ -1128,9 +1128,9 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) {
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    addi a2, a1, -1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    addi a2, a1, -1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a2
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
@@ -1602,11 +1602,11 @@ define void @fmsub_v8bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-NEXT:    vle16.v v10, (a1)
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v8, v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -1630,11 +1630,11 @@ define void @fmsub_v6bf16(ptr %x, ptr %y, ptr %z) {
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v8, v12, v14
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -1667,11 +1667,11 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-NEXT:    vle16.v v10, (a1)
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -1705,11 +1705,11 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) {
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3717,14 +3717,14 @@ define void @fmsub_vf_v8bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vmv.v.x v10, a2
 ; CHECK-NEXT:    lui a1, 8
+; CHECK-NEXT:    vmv.v.x v10, a2
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v8, v12, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -3746,15 +3746,15 @@ define void @fmsub_vf_v6bf16(ptr %x, ptr %y, bfloat %z) {
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a2
-; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v8, v14, v12
+; CHECK-NEXT:    vfmadd.vv v8, v12, v14
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v10, v8
 ; CHECK-NEXT:    vse16.v v10, (a0)
@@ -3785,14 +3785,14 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
-; ZVFHMIN-NEXT:    vmv.v.x v10, a2
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vmv.v.x v10, a2
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3823,15 +3823,15 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) {
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a1)
 ; ZVFHMIN-NEXT:    vle16.v v9, (a0)
+; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a2
-; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v12, v14
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vse16.v v10, (a0)
@@ -3929,11 +3929,11 @@ define void @trunc_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -3954,12 +3954,12 @@ define void @trunc_v6bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -3995,11 +3995,11 @@ define void @trunc_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4036,12 +4036,12 @@ define void @trunc_v6f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4062,9 +4062,9 @@ define void @trunc_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4104,11 +4104,11 @@ define void @ceil_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4131,12 +4131,12 @@ define void @ceil_v6bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4176,11 +4176,11 @@ define void @ceil_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4221,12 +4221,12 @@ define void @ceil_v6f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 3
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4249,9 +4249,9 @@ define void @ceil_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 3
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
@@ -4295,11 +4295,11 @@ define void @floor_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4322,12 +4322,12 @@ define void @floor_v6bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4367,11 +4367,11 @@ define void @floor_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4412,12 +4412,12 @@ define void @floor_v6f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4440,9 +4440,9 @@ define void @floor_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
@@ -4486,11 +4486,11 @@ define void @round_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4513,12 +4513,12 @@ define void @round_v6bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4558,11 +4558,11 @@ define void @round_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4603,12 +4603,12 @@ define void @round_v6f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 6, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a1, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4631,9 +4631,9 @@ define void @round_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
@@ -4677,11 +4677,11 @@ define void @rint_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4717,11 +4717,11 @@ define void @rint_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -4742,9 +4742,9 @@ define void @rint_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v9, v9, v0.t
@@ -4784,11 +4784,11 @@ define void @nearbyint_v8bf16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a1, 307200
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a1, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4828,11 +4828,11 @@ define void @nearbyint_v8f16(ptr %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vle16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a1, 307200
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a1, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -4855,9 +4855,9 @@ define void @nearbyint_v4f32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a1
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a1
 ; CHECK-NEXT:    vfcvt.x.f.v v9, v8, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
index da0bc5983a1251..7f4483a8f77d9c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
@@ -124,21 +124,21 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
 ; ZVFH32:       # %bb.0:
 ; ZVFH32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH32-NEXT:    vfncvt.rtz.x.f.w v9, v8
+; ZVFH32-NEXT:    lui a1, 8
 ; ZVFH32-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFH32-NEXT:    vmv.x.s a1, v8
 ; ZVFH32-NEXT:    vmv.x.s a2, v9
-; ZVFH32-NEXT:    lui a3, 8
-; ZVFH32-NEXT:    addi a3, a3, -1
-; ZVFH32-NEXT:    and a2, a2, a3
-; ZVFH32-NEXT:    vslidedown.vi v8, v9, 1
-; ZVFH32-NEXT:    vmv.x.s a4, v8
-; ZVFH32-NEXT:    and a3, a4, a3
-; ZVFH32-NEXT:    slli a4, a1, 17
+; ZVFH32-NEXT:    addi a1, a1, -1
+; ZVFH32-NEXT:    vslidedown.vi v9, v9, 1
+; ZVFH32-NEXT:    vmv.x.s a3, v8
+; ZVFH32-NEXT:    and a2, a2, a1
+; ZVFH32-NEXT:    vmv.x.s a4, v9
+; ZVFH32-NEXT:    and a1, a4, a1
+; ZVFH32-NEXT:    slli a4, a3, 17
+; ZVFH32-NEXT:    slli a3, a3, 30
 ; ZVFH32-NEXT:    srli a4, a4, 19
-; ZVFH32-NEXT:    slli a3, a3, 15
-; ZVFH32-NEXT:    slli a1, a1, 30
+; ZVFH32-NEXT:    slli a1, a1, 15
+; ZVFH32-NEXT:    or a2, a2, a3
 ; ZVFH32-NEXT:    or a1, a2, a1
-; ZVFH32-NEXT:    or a1, a1, a3
 ; ZVFH32-NEXT:    sw a1, 0(a0)
 ; ZVFH32-NEXT:    sh a4, 4(a0)
 ; ZVFH32-NEXT:    ret
@@ -147,19 +147,19 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
 ; ZVFH64:       # %bb.0:
 ; ZVFH64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH64-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; ZVFH64-NEXT:    vmv.x.s a1, v9
-; ZVFH64-NEXT:    lui a2, 8
-; ZVFH64-NEXT:    addiw a2, a2, -1
-; ZVFH64-NEXT:    and a1, a1, a2
+; ZVFH64-NEXT:    lui a1, 8
+; ZVFH64-NEXT:    vmv.x.s a2, v9
+; ZVFH64-NEXT:    addiw a1, a1, -1
 ; ZVFH64-NEXT:    vslidedown.vi v8, v9, 1
+; ZVFH64-NEXT:    vslidedown.vi v9, v9, 2
+; ZVFH64-NEXT:    and a2, a2, a1
 ; ZVFH64-NEXT:    vmv.x.s a3, v8
-; ZVFH64-NEXT:    and a2, a3, a2
-; ZVFH64-NEXT:    slli a2, a2, 15
-; ZVFH64-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFH64-NEXT:    vmv.x.s a3, v8
+; ZVFH64-NEXT:    and a1, a3, a1
+; ZVFH64-NEXT:    vmv.x.s a3, v9
 ; ZVFH64-NEXT:    slli a3, a3, 30
-; ZVFH64-NEXT:    or a1, a1, a3
-; ZVFH64-NEXT:    or a1, a1, a2
+; ZVFH64-NEXT:    slli a1, a1, 15
+; ZVFH64-NEXT:    or a2, a2, a3
+; ZVFH64-NEXT:    or a1, a2, a1
 ; ZVFH64-NEXT:    slli a2, a1, 19
 ; ZVFH64-NEXT:    srli a2, a2, 51
 ; ZVFH64-NEXT:    sw a1, 0(a0)
@@ -170,21 +170,21 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
 ; ZVFHMIN32:       # %bb.0:
 ; ZVFHMIN32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN32-NEXT:    vfncvt.rtz.x.f.w v9, v8
+; ZVFHMIN32-NEXT:    lui a1, 8
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
 ; ZVFHMIN32-NEXT:    vmv.x.s a2, v9
-; ZVFHMIN32-NEXT:    lui a3, 8
-; ZVFHMIN32-NEXT:    addi a3, a3, -1
-; ZVFHMIN32-NEXT:    and a2, a2, a3
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v9, 1
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    and a3, a4, a3
-; ZVFHMIN32-NEXT:    slli a4, a1, 17
+; ZVFHMIN32-NEXT:    addi a1, a1, -1
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v9, 1
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    and a2, a2, a1
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v9
+; ZVFHMIN32-NEXT:    and a1, a4, a1
+; ZVFHMIN32-NEXT:    slli a4, a3, 17
+; ZVFHMIN32-NEXT:    slli a3, a3, 30
 ; ZVFHMIN32-NEXT:    srli a4, a4, 19
-; ZVFHMIN32-NEXT:    slli a3, a3, 15
-; ZVFHMIN32-NEXT:    slli a1, a1, 30
+; ZVFHMIN32-NEXT:    slli a1, a1, 15
+; ZVFHMIN32-NEXT:    or a2, a2, a3
 ; ZVFHMIN32-NEXT:    or a1, a2, a1
-; ZVFHMIN32-NEXT:    or a1, a1, a3
 ; ZVFHMIN32-NEXT:    sw a1, 0(a0)
 ; ZVFHMIN32-NEXT:    sh a4, 4(a0)
 ; ZVFHMIN32-NEXT:    ret
@@ -193,19 +193,19 @@ define <3 x i15> @fp2si_v3f32_v3i15(<3 x float> %x) {
 ; ZVFHMIN64:       # %bb.0:
 ; ZVFHMIN64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN64-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v9
-; ZVFHMIN64-NEXT:    lui a2, 8
-; ZVFHMIN64-NEXT:    addiw a2, a2, -1
-; ZVFHMIN64-NEXT:    and a1, a1, a2
+; ZVFHMIN64-NEXT:    lui a1, 8
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v9
+; ZVFHMIN64-NEXT:    addiw a1, a1, -1
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v9, 1
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v9, 2
+; ZVFHMIN64-NEXT:    and a2, a2, a1
 ; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    and a2, a3, a2
-; ZVFHMIN64-NEXT:    slli a2, a2, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    and a1, a3, a1
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v9
 ; ZVFHMIN64-NEXT:    slli a3, a3, 30
-; ZVFHMIN64-NEXT:    or a1, a1, a3
-; ZVFHMIN64-NEXT:    or a1, a1, a2
+; ZVFHMIN64-NEXT:    slli a1, a1, 15
+; ZVFHMIN64-NEXT:    or a2, a2, a3
+; ZVFHMIN64-NEXT:    or a1, a2, a1
 ; ZVFHMIN64-NEXT:    slli a2, a1, 19
 ; ZVFHMIN64-NEXT:    srli a2, a2, 51
 ; ZVFHMIN64-NEXT:    sw a1, 0(a0)
@@ -221,21 +221,21 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
 ; ZVFH32:       # %bb.0:
 ; ZVFH32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH32-NEXT:    vfncvt.rtz.x.f.w v9, v8
+; ZVFH32-NEXT:    lui a1, 16
 ; ZVFH32-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFH32-NEXT:    vmv.x.s a1, v8
 ; ZVFH32-NEXT:    vmv.x.s a2, v9
-; ZVFH32-NEXT:    lui a3, 16
-; ZVFH32-NEXT:    addi a3, a3, -1
-; ZVFH32-NEXT:    and a2, a2, a3
-; ZVFH32-NEXT:    vslidedown.vi v8, v9, 1
-; ZVFH32-NEXT:    vmv.x.s a4, v8
-; ZVFH32-NEXT:    and a3, a4, a3
-; ZVFH32-NEXT:    slli a4, a1, 17
+; ZVFH32-NEXT:    addi a1, a1, -1
+; ZVFH32-NEXT:    vslidedown.vi v9, v9, 1
+; ZVFH32-NEXT:    vmv.x.s a3, v8
+; ZVFH32-NEXT:    and a2, a2, a1
+; ZVFH32-NEXT:    vmv.x.s a4, v9
+; ZVFH32-NEXT:    and a1, a4, a1
+; ZVFH32-NEXT:    slli a4, a3, 17
+; ZVFH32-NEXT:    slli a3, a3, 30
 ; ZVFH32-NEXT:    srli a4, a4, 19
-; ZVFH32-NEXT:    slli a3, a3, 15
-; ZVFH32-NEXT:    slli a1, a1, 30
+; ZVFH32-NEXT:    slli a1, a1, 15
+; ZVFH32-NEXT:    or a2, a2, a3
 ; ZVFH32-NEXT:    or a1, a2, a1
-; ZVFH32-NEXT:    or a1, a1, a3
 ; ZVFH32-NEXT:    sw a1, 0(a0)
 ; ZVFH32-NEXT:    sh a4, 4(a0)
 ; ZVFH32-NEXT:    ret
@@ -244,19 +244,19 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
 ; ZVFH64:       # %bb.0:
 ; ZVFH64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFH64-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; ZVFH64-NEXT:    vmv.x.s a1, v9
-; ZVFH64-NEXT:    lui a2, 16
-; ZVFH64-NEXT:    addiw a2, a2, -1
-; ZVFH64-NEXT:    and a1, a1, a2
+; ZVFH64-NEXT:    lui a1, 16
+; ZVFH64-NEXT:    vmv.x.s a2, v9
+; ZVFH64-NEXT:    addiw a1, a1, -1
 ; ZVFH64-NEXT:    vslidedown.vi v8, v9, 1
+; ZVFH64-NEXT:    vslidedown.vi v9, v9, 2
+; ZVFH64-NEXT:    and a2, a2, a1
 ; ZVFH64-NEXT:    vmv.x.s a3, v8
-; ZVFH64-NEXT:    and a2, a3, a2
-; ZVFH64-NEXT:    slli a2, a2, 15
-; ZVFH64-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFH64-NEXT:    vmv.x.s a3, v8
+; ZVFH64-NEXT:    and a1, a3, a1
+; ZVFH64-NEXT:    vmv.x.s a3, v9
 ; ZVFH64-NEXT:    slli a3, a3, 30
-; ZVFH64-NEXT:    or a1, a1, a3
-; ZVFH64-NEXT:    or a1, a1, a2
+; ZVFH64-NEXT:    slli a1, a1, 15
+; ZVFH64-NEXT:    or a2, a2, a3
+; ZVFH64-NEXT:    or a1, a2, a1
 ; ZVFH64-NEXT:    slli a2, a1, 19
 ; ZVFH64-NEXT:    srli a2, a2, 51
 ; ZVFH64-NEXT:    sw a1, 0(a0)
@@ -267,21 +267,21 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
 ; ZVFHMIN32:       # %bb.0:
 ; ZVFHMIN32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN32-NEXT:    vfncvt.rtz.x.f.w v9, v8
+; ZVFHMIN32-NEXT:    lui a1, 16
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
 ; ZVFHMIN32-NEXT:    vmv.x.s a2, v9
-; ZVFHMIN32-NEXT:    lui a3, 16
-; ZVFHMIN32-NEXT:    addi a3, a3, -1
-; ZVFHMIN32-NEXT:    and a2, a2, a3
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v9, 1
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
-; ZVFHMIN32-NEXT:    and a3, a4, a3
-; ZVFHMIN32-NEXT:    slli a4, a1, 17
+; ZVFHMIN32-NEXT:    addi a1, a1, -1
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v9, 1
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    and a2, a2, a1
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v9
+; ZVFHMIN32-NEXT:    and a1, a4, a1
+; ZVFHMIN32-NEXT:    slli a4, a3, 17
+; ZVFHMIN32-NEXT:    slli a3, a3, 30
 ; ZVFHMIN32-NEXT:    srli a4, a4, 19
-; ZVFHMIN32-NEXT:    slli a3, a3, 15
-; ZVFHMIN32-NEXT:    slli a1, a1, 30
+; ZVFHMIN32-NEXT:    slli a1, a1, 15
+; ZVFHMIN32-NEXT:    or a2, a2, a3
 ; ZVFHMIN32-NEXT:    or a1, a2, a1
-; ZVFHMIN32-NEXT:    or a1, a1, a3
 ; ZVFHMIN32-NEXT:    sw a1, 0(a0)
 ; ZVFHMIN32-NEXT:    sh a4, 4(a0)
 ; ZVFHMIN32-NEXT:    ret
@@ -290,19 +290,19 @@ define <3 x i15> @fp2ui_v3f32_v3i15(<3 x float> %x) {
 ; ZVFHMIN64:       # %bb.0:
 ; ZVFHMIN64-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN64-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v9
-; ZVFHMIN64-NEXT:    lui a2, 16
-; ZVFHMIN64-NEXT:    addiw a2, a2, -1
-; ZVFHMIN64-NEXT:    and a1, a1, a2
+; ZVFHMIN64-NEXT:    lui a1, 16
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v9
+; ZVFHMIN64-NEXT:    addiw a1, a1, -1
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v9, 1
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v9, 2
+; ZVFHMIN64-NEXT:    and a2, a2, a1
 ; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    and a2, a3, a2
-; ZVFHMIN64-NEXT:    slli a2, a2, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v9, 2
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    and a1, a3, a1
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v9
 ; ZVFHMIN64-NEXT:    slli a3, a3, 30
-; ZVFHMIN64-NEXT:    or a1, a1, a3
-; ZVFHMIN64-NEXT:    or a1, a1, a2
+; ZVFHMIN64-NEXT:    slli a1, a1, 15
+; ZVFHMIN64-NEXT:    or a2, a2, a3
+; ZVFHMIN64-NEXT:    or a1, a2, a1
 ; ZVFHMIN64-NEXT:    slli a2, a1, 19
 ; ZVFHMIN64-NEXT:    srli a2, a2, 51
 ; ZVFHMIN64-NEXT:    sw a1, 0(a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
index 131fa53b359998..be32c033fe3738 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
@@ -125,10 +125,10 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: round_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -150,10 +150,10 @@ define <1 x float> @round_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -173,10 +173,10 @@ define <2 x float> @round_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -196,10 +196,10 @@ define <4 x float> @round_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -219,10 +219,10 @@ define <8 x float> @round_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -242,10 +242,10 @@ define <16 x float> @round_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
index b21be367f8ef5e..774ce5c7859c9d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll
@@ -30,9 +30,9 @@ define <1 x half> @round_v1f16(<1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -69,9 +69,9 @@ define <2 x half> @round_v2f16(<2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -108,9 +108,9 @@ define <4 x half> @round_v4f16(<4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -147,9 +147,9 @@ define <8 x half> @round_v8f16(<8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -186,9 +186,9 @@ define <16 x half> @round_v16f16(<16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -225,12 +225,12 @@ define <32 x half> @round_v32f16(<32 x half> %x) {
 ; ZVFHMIN-LABEL: round_v32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a0, 32
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
index 37f2e59ad7516d..5c0279e133dfaa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll
@@ -125,10 +125,10 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: roundeven_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -150,10 +150,10 @@ define <1 x float> @roundeven_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -173,10 +173,10 @@ define <2 x float> @roundeven_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -196,10 +196,10 @@ define <4 x float> @roundeven_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -219,10 +219,10 @@ define <8 x float> @roundeven_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -242,10 +242,10 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
index 13d62bb24441c9..0b6baad127643a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll
@@ -30,9 +30,9 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -69,9 +69,9 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -108,9 +108,9 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -147,9 +147,9 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -186,9 +186,9 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -225,12 +225,12 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) {
 ; ZVFHMIN-LABEL: roundeven_v32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    li a0, 32
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
index 86abfb771162fc..a68dc11f3d21e7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll
@@ -674,10 +674,10 @@ define <16 x i64> @fshr_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
@@ -710,11 +710,11 @@ define <16 x i64> @fshl_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
index b911722368ce3a..2173887e854178 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll
@@ -113,10 +113,10 @@ define <32 x half> @trunc_v32f16(<32 x half> %x) strictfp {
 ; CHECK-LABEL: trunc_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    lui a1, %hi(.LCPI5_0)
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a0)
+; CHECK-NEXT:    flh fa5, %lo(.LCPI5_0)(a1)
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
@@ -136,10 +136,10 @@ define <1 x float> @trunc_v1f32(<1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -157,10 +157,10 @@ define <2 x float> @trunc_v2f32(<2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -178,10 +178,10 @@ define <4 x float> @trunc_v4f32(<4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -199,10 +199,10 @@ define <8 x float> @trunc_v8f32(<8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
@@ -220,10 +220,10 @@ define <16 x float> @trunc_v16f32(<16 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v12, v8, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
index f7737784d4ca57..986636d974acaa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
@@ -19,8 +19,8 @@ define <1 x i1> @insertelt_idx_v1i1(<1 x i1> %x, i1 %elt, i32 zeroext %idx) noun
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
@@ -52,8 +52,8 @@ define <2 x i1> @insertelt_idx_v2i1(<2 x i1> %x, i1 %elt, i32 zeroext %idx) noun
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
@@ -87,8 +87,8 @@ define <8 x i1> @insertelt_idx_v8i1(<8 x i1> %x, i1 %elt, i32 zeroext %idx) noun
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 5581754b0721a5..1fbc8dfd688c4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -756,15 +756,15 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV32VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32VLA-NEXT:    vle64.v v8, (a0)
 ; RV32VLA-NEXT:    addi a0, sp, 128
+; RV32VLA-NEXT:    csrr a2, vlenb
+; RV32VLA-NEXT:    addi a3, sp, 64
+; RV32VLA-NEXT:    slli a2, a2, 3
 ; RV32VLA-NEXT:    vse64.v v8, (a0)
-; RV32VLA-NEXT:    csrr a0, vlenb
-; RV32VLA-NEXT:    slli a0, a0, 3
-; RV32VLA-NEXT:    addi a2, sp, 64
-; RV32VLA-NEXT:    add a3, a2, a0
-; RV32VLA-NEXT:    vl8re64.v v8, (a3)
-; RV32VLA-NEXT:    vl8re64.v v16, (a2)
-; RV32VLA-NEXT:    add a0, a1, a0
-; RV32VLA-NEXT:    vs8r.v v8, (a0)
+; RV32VLA-NEXT:    add a0, a3, a2
+; RV32VLA-NEXT:    vl8re64.v v8, (a0)
+; RV32VLA-NEXT:    vl8re64.v v16, (a3)
+; RV32VLA-NEXT:    add a2, a1, a2
+; RV32VLA-NEXT:    vs8r.v v8, (a2)
 ; RV32VLA-NEXT:    vs8r.v v16, (a1)
 ; RV32VLA-NEXT:    addi sp, s0, -80
 ; RV32VLA-NEXT:    .cfi_def_cfa sp, 80
@@ -793,15 +793,15 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV64VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64VLA-NEXT:    vle64.v v8, (a0)
 ; RV64VLA-NEXT:    addi a0, sp, 128
+; RV64VLA-NEXT:    csrr a2, vlenb
+; RV64VLA-NEXT:    addi a3, sp, 64
+; RV64VLA-NEXT:    slli a2, a2, 3
 ; RV64VLA-NEXT:    vse64.v v8, (a0)
-; RV64VLA-NEXT:    csrr a0, vlenb
-; RV64VLA-NEXT:    slli a0, a0, 3
-; RV64VLA-NEXT:    addi a2, sp, 64
-; RV64VLA-NEXT:    add a3, a2, a0
-; RV64VLA-NEXT:    vl8re64.v v8, (a3)
-; RV64VLA-NEXT:    vl8re64.v v16, (a2)
-; RV64VLA-NEXT:    add a0, a1, a0
-; RV64VLA-NEXT:    vs8r.v v8, (a0)
+; RV64VLA-NEXT:    add a0, a3, a2
+; RV64VLA-NEXT:    vl8re64.v v8, (a0)
+; RV64VLA-NEXT:    vl8re64.v v16, (a3)
+; RV64VLA-NEXT:    add a2, a1, a2
+; RV64VLA-NEXT:    vs8r.v v8, (a2)
 ; RV64VLA-NEXT:    vs8r.v v16, (a1)
 ; RV64VLA-NEXT:    addi sp, s0, -80
 ; RV64VLA-NEXT:    .cfi_def_cfa sp, 80
@@ -828,9 +828,9 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV32VLS-NEXT:    vl1re64.v v8, (a0)
 ; RV32VLS-NEXT:    addi a0, sp, 128
 ; RV32VLS-NEXT:    vs1r.v v8, (a0)
+; RV32VLS-NEXT:    addi a0, sp, 192
+; RV32VLS-NEXT:    vl8re64.v v8, (a0)
 ; RV32VLS-NEXT:    addi a0, sp, 64
-; RV32VLS-NEXT:    addi a2, sp, 192
-; RV32VLS-NEXT:    vl8re64.v v8, (a2)
 ; RV32VLS-NEXT:    vl8re64.v v16, (a0)
 ; RV32VLS-NEXT:    addi a0, a1, 128
 ; RV32VLS-NEXT:    vs8r.v v8, (a0)
@@ -860,9 +860,9 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV64VLS-NEXT:    vl1re64.v v8, (a0)
 ; RV64VLS-NEXT:    addi a0, sp, 128
 ; RV64VLS-NEXT:    vs1r.v v8, (a0)
+; RV64VLS-NEXT:    addi a0, sp, 192
+; RV64VLS-NEXT:    vl8re64.v v8, (a0)
 ; RV64VLS-NEXT:    addi a0, sp, 64
-; RV64VLS-NEXT:    addi a2, sp, 192
-; RV64VLS-NEXT:    vl8re64.v v8, (a2)
 ; RV64VLS-NEXT:    vl8re64.v v16, (a0)
 ; RV64VLS-NEXT:    addi a0, a1, 128
 ; RV64VLS-NEXT:    vs8r.v v8, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
index 0ff3641483ddbc..6782b2003ba94b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll
@@ -122,14 +122,14 @@ define <64 x i32> @insertelt_v64i32_idx(<64 x i32> %a, i32 %y, i32 zeroext %idx)
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    andi a1, a1, 63
-; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    mv a2, sp
-; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    addi a3, sp, 128
 ; RV32-NEXT:    li a4, 32
+; RV32-NEXT:    slli a1, a1, 2
 ; RV32-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; RV32-NEXT:    vse32.v v16, (a3)
 ; RV32-NEXT:    vse32.v v8, (a2)
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    sw a0, 0(a1)
 ; RV32-NEXT:    vle32.v v8, (a2)
 ; RV32-NEXT:    vle32.v v16, (a3)
@@ -155,14 +155,14 @@ define <64 x i32> @insertelt_v64i32_idx(<64 x i32> %a, i32 %y, i32 zeroext %idx)
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    andi a1, a1, 63
-; RV64-NEXT:    slli a1, a1, 2
 ; RV64-NEXT:    mv a2, sp
-; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    addi a3, sp, 128
 ; RV64-NEXT:    li a4, 32
+; RV64-NEXT:    slli a1, a1, 2
 ; RV64-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; RV64-NEXT:    vse32.v v16, (a3)
 ; RV64-NEXT:    vse32.v v8, (a2)
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    sw a0, 0(a1)
 ; RV64-NEXT:    vle32.v v8, (a2)
 ; RV64-NEXT:    vle32.v v16, (a3)
@@ -228,17 +228,17 @@ define <3 x i64> @insertelt_v3i64(<3 x i64> %a, i64 %y) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 3
-; RV32-NEXT:    vmv.x.s a2, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    vslidedown.vi v8, v8, 1
 ; RV32-NEXT:    vmv.x.s a3, v9
-; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vmv.x.s a4, v9
+; RV32-NEXT:    vmv.x.s a4, v10
 ; RV32-NEXT:    vmv.x.s a5, v8
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a5
+; RV32-NEXT:    vmv.v.x v8, a2
+; RV32-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32-NEXT:    vslide1down.vx v8, v8, a3
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslidedown.vi v8, v8, 2
@@ -248,11 +248,11 @@ define <3 x i64> @insertelt_v3i64(<3 x i64> %a, i64 %y) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v9
-; RV64-NEXT:    vmv.x.s a2, v8
+; RV64-NEXT:    vmv.x.s a1, v8
+; RV64-NEXT:    vmv.x.s a2, v9
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    vmv.v.x v8, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-NEXT:    ret
@@ -313,9 +313,9 @@ define <32 x i16> @insertelt_v32i16(<32 x i16> %a, i16 %y, i32 %idx) {
 ; RV64-LABEL: insertelt_v32i16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a2, 32
+; RV64-NEXT:    slli a1, a1, 32
 ; RV64-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
 ; RV64-NEXT:    vmv.s.x v12, a0
-; RV64-NEXT:    slli a1, a1, 32
 ; RV64-NEXT:    srli a1, a1, 32
 ; RV64-NEXT:    addi a0, a1, 1
 ; RV64-NEXT:    vsetvli zero, a0, e16, m4, tu, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
index 49644fa4a9fb87..e9fd0a19e3eb66 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -697,12 +697,11 @@ define void @buildvec_seq_v9i8(ptr %x) {
 ; CHECK-LABEL: buildvec_seq_v9i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 73
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 3
-; CHECK-NEXT:    li a1, 146
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    li a1, 146
 ; CHECK-NEXT:    vmv.s.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
@@ -728,49 +727,27 @@ define void @buildvec_seq_v4i16_v2i32(ptr %x) {
 }
 
 define void @buildvec_vid_step1o2_v4i32(ptr %z0, ptr %z1, ptr %z2, ptr %z3, ptr %z4, ptr %z5, ptr %z6) {
-; RV32-LABEL: buildvec_vid_step1o2_v4i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vid.v v8
-; RV32-NEXT:    vsrl.vi v8, v8, 1
-; RV32-NEXT:    vse32.v v8, (a0)
-; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    vmv.v.i v9, 1
-; RV32-NEXT:    vse32.v v8, (a2)
-; RV32-NEXT:    vse32.v v8, (a3)
-; RV32-NEXT:    vse32.v v8, (a4)
-; RV32-NEXT:    vmv.s.x v8, zero
-; RV32-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; RV32-NEXT:    vslideup.vi v9, v8, 1
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vse32.v v9, (a5)
-; RV32-NEXT:    vmv.v.i v8, 0
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-NEXT:    vse32.v v8, (a6)
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: buildvec_vid_step1o2_v4i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    vsrl.vi v8, v8, 1
-; RV64-NEXT:    vse32.v v8, (a0)
-; RV64-NEXT:    vmv.v.i v9, 1
-; RV64-NEXT:    vse32.v v8, (a1)
-; RV64-NEXT:    vse32.v v8, (a2)
-; RV64-NEXT:    vse32.v v8, (a3)
-; RV64-NEXT:    vse32.v v8, (a4)
-; RV64-NEXT:    vmv.s.x v8, zero
-; RV64-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; RV64-NEXT:    vslideup.vi v9, v8, 1
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vse32.v v9, (a5)
-; RV64-NEXT:    vmv.v.i v8, 0
-; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
-; RV64-NEXT:    vse32.v v8, (a6)
-; RV64-NEXT:    ret
+; CHECK-LABEL: buildvec_vid_step1o2_v4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 1
+; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 1
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vsrl.vi v9, v9, 1
+; CHECK-NEXT:    vse32.v v9, (a0)
+; CHECK-NEXT:    vse32.v v9, (a1)
+; CHECK-NEXT:    vse32.v v9, (a2)
+; CHECK-NEXT:    vse32.v v9, (a3)
+; CHECK-NEXT:    vse32.v v9, (a4)
+; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
+; CHECK-NEXT:    vse32.v v8, (a5)
+; CHECK-NEXT:    vse32.v v9, (a6)
+; CHECK-NEXT:    ret
   store <4 x i32> <i32 0, i32 0, i32 1, i32 1>, ptr %z0
   store <4 x i32> <i32 0, i32 0, i32 1, i32 undef>, ptr %z1
   store <4 x i32> <i32 0, i32 undef, i32 1, i32 1>, ptr %z2
@@ -788,22 +765,22 @@ define void @buildvec_vid_step1o2_add3_v4i16(ptr %z0, ptr %z1, ptr %z2, ptr %z3,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vmv.v.i v9, 3
 ; CHECK-NEXT:    vsrl.vi v8, v8, 1
 ; CHECK-NEXT:    vadd.vi v8, v8, 3
 ; CHECK-NEXT:    vse16.v v8, (a0)
-; CHECK-NEXT:    vmv.v.i v9, 3
 ; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    vse16.v v8, (a3)
 ; CHECK-NEXT:    vse16.v v8, (a4)
 ; CHECK-NEXT:    vmv.v.i v8, 4
+; CHECK-NEXT:    li a0, 4
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
 ; CHECK-NEXT:    vse16.v v8, (a5)
-; CHECK-NEXT:    li a0, 4
-; CHECK-NEXT:    vslide1down.vx v8, v9, a0
-; CHECK-NEXT:    vse16.v v8, (a6)
+; CHECK-NEXT:    vse16.v v9, (a6)
 ; CHECK-NEXT:    ret
   store <4 x i16> <i16 3, i16 3, i16 4, i16 4>, ptr %z0
   store <4 x i16> <i16 3, i16 3, i16 4, i16 undef>, ptr %z1
@@ -997,21 +974,19 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
 ; RV32-NEXT:    li a0, 512
-; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV32-NEXT:    vmv.v.i v12, 3
 ; RV32-NEXT:    li a1, 240
-; RV32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; RV32-NEXT:    vmv.s.x v8, a1
+; RV32-NEXT:    li a1, 15
+; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT:    vmv.v.i v12, 3
+; RV32-NEXT:    slli a1, a1, 8
 ; RV32-NEXT:    vmv1r.v v0, v10
-; RV32-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
 ; RV32-NEXT:    vmerge.vim v12, v12, 0, v0
 ; RV32-NEXT:    vmv1r.v v0, v8
 ; RV32-NEXT:    vsetivli zero, 16, e32, mf2, ta, ma
 ; RV32-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV32-NEXT:    li a1, 15
-; RV32-NEXT:    slli a1, a1, 8
 ; RV32-NEXT:    vmv.s.x v8, a1
 ; RV32-NEXT:    vmv1r.v v0, v10
 ; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -1029,19 +1004,17 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.i v0, 3
 ; RV64V-NEXT:    vmv.v.i v9, 0
-; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
 ; RV64V-NEXT:    li a0, 512
+; RV64V-NEXT:    vmv.v.i v8, 12
+; RV64V-NEXT:    li a1, 48
+; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmv.v.i v12, 3
-; RV64V-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; RV64V-NEXT:    vmv.v.i v8, 12
 ; RV64V-NEXT:    vmv1r.v v0, v10
-; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; RV64V-NEXT:    vmerge.vim v12, v12, 0, v0
 ; RV64V-NEXT:    vmv1r.v v0, v8
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m1, ta, ma
 ; RV64V-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV64V-NEXT:    li a1, 48
 ; RV64V-NEXT:    vmv.s.x v8, a1
 ; RV64V-NEXT:    vmv.v.v v0, v10
 ; RV64V-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -1059,21 +1032,19 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32-NEXT:    vmv.v.i v9, 0
-; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
 ; RV64ZVE32-NEXT:    li a0, 512
-; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; RV64ZVE32-NEXT:    vmv.v.i v12, 3
 ; RV64ZVE32-NEXT:    li a1, 240
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; RV64ZVE32-NEXT:    vmv.s.x v8, a1
+; RV64ZVE32-NEXT:    li a1, 15
+; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
+; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64ZVE32-NEXT:    vmv.v.i v12, 3
+; RV64ZVE32-NEXT:    slli a1, a1, 8
 ; RV64ZVE32-NEXT:    vmv1r.v v0, v10
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
 ; RV64ZVE32-NEXT:    vmerge.vim v12, v12, 0, v0
 ; RV64ZVE32-NEXT:    vmv1r.v v0, v8
 ; RV64ZVE32-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
 ; RV64ZVE32-NEXT:    vmerge.vim v10, v9, -1, v0
-; RV64ZVE32-NEXT:    li a1, 15
-; RV64ZVE32-NEXT:    slli a1, a1, 8
 ; RV64ZVE32-NEXT:    vmv.s.x v8, a1
 ; RV64ZVE32-NEXT:    vmv.v.v v0, v10
 ; RV64ZVE32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
@@ -1091,9 +1062,10 @@ define <512 x i8> @buildvec_not_vid_v512i8_indices_overflow_2() vscale_range(16,
 define <8 x i32> @prefix_overwrite(<8 x i32> %vin, i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: prefix_overwrite:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v8, a0
+; CHECK-NEXT:    vsetivli zero, 8, e32, m1, tu, ma
 ; CHECK-NEXT:    vmv.s.x v10, a1
+; CHECK-NEXT:    vmv.s.x v8, a0
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 1
 ; CHECK-NEXT:    vmv.s.x v10, a2
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
@@ -1137,12 +1109,12 @@ define <4 x i64> @v4xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d) vscale_range(2,2)
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v10, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32-NEXT:    vslide1down.vx v9, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vslide1down.vx v8, v10, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32-NEXT:    ret
 ;
@@ -1186,13 +1158,13 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i
 ; RV32-NEXT:    lw s0, 16(sp)
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v9, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-NEXT:    vslide1down.vx v10, v9, a2
 ; RV32-NEXT:    vslide1down.vx v9, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v10, a3
 ; RV32-NEXT:    vmv.v.x v10, s0
 ; RV32-NEXT:    vslide1down.vx v10, v10, t6
 ; RV32-NEXT:    vslide1down.vx v10, v10, t5
@@ -1211,9 +1183,9 @@ define <8 x i64> @v8xi64_exact(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i
 ; RV64V:       # %bb.0:
 ; RV64V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64V-NEXT:    vmv.v.x v8, a2
+; RV64V-NEXT:    vmv.v.x v10, a0
 ; RV64V-NEXT:    vslide1down.vx v9, v8, a3
-; RV64V-NEXT:    vmv.v.x v8, a0
-; RV64V-NEXT:    vslide1down.vx v8, v8, a1
+; RV64V-NEXT:    vslide1down.vx v8, v10, a1
 ; RV64V-NEXT:    vmv.v.x v10, a4
 ; RV64V-NEXT:    vslide1down.vx v10, v10, a5
 ; RV64V-NEXT:    vmv.v.x v11, a6
@@ -1248,12 +1220,12 @@ define <8 x i64> @v8xi64_exact_equal_halves(i64 %a, i64 %b, i64 %c, i64 %d) vsca
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v10, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32-NEXT:    vslide1down.vx v9, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vslide1down.vx v8, v10, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32-NEXT:    vmv.v.v v10, v8
 ; RV32-NEXT:    vmv.v.v v11, v9
@@ -1297,12 +1269,12 @@ define <8 x i64> @v8xi64_exact_undef_suffix(i64 %a, i64 %b, i64 %c, i64 %d) vsca
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v10, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32-NEXT:    vslide1down.vx v9, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vslide1down.vx v8, v10, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32-NEXT:    ret
 ;
@@ -1334,12 +1306,12 @@ define <8 x i64> @v8xi64_exact_undef_prefix(i64 %a, i64 %b, i64 %c, i64 %d) vsca
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-NEXT:    vslide1down.vx v9, v9, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32-NEXT:    vslide1down.vx v11, v8, a7
-; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vslide1down.vx v8, v9, a2
 ; RV32-NEXT:    vslide1down.vx v10, v8, a3
 ; RV32-NEXT:    ret
 ;
@@ -1386,32 +1358,31 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu t2, 9(a0)
 ; RV32-ONLY-NEXT:    lbu t3, 10(a0)
 ; RV32-ONLY-NEXT:    lbu t4, 11(a0)
+; RV32-ONLY-NEXT:    li t5, 255
+; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV32-ONLY-NEXT:    lbu t5, 12(a0)
 ; RV32-ONLY-NEXT:    lbu t6, 13(a0)
 ; RV32-ONLY-NEXT:    lbu s0, 14(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 15(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a1
+; RV32-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t2
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t3
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t4
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a5
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t5
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, t0
-; RV32-ONLY-NEXT:    vmv.v.x v8, t1
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t2
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t3
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t4
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t5
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t6
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, s0
-; RV32-ONLY-NEXT:    li a1, 255
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, a1
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, s0
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, t0
+; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-ONLY-NEXT:    .cfi_restore s0
 ; RV32-ONLY-NEXT:    addi sp, sp, 16
@@ -1420,45 +1391,45 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ;
 ; RV32VB-LABEL: buildvec_v16i8_loads_contigous:
 ; RV32VB:       # %bb.0:
-; RV32VB-NEXT:    lbu a1, 1(a0)
-; RV32VB-NEXT:    lbu a2, 0(a0)
+; RV32VB-NEXT:    lbu a1, 0(a0)
+; RV32VB-NEXT:    lbu a2, 1(a0)
 ; RV32VB-NEXT:    lbu a3, 2(a0)
 ; RV32VB-NEXT:    lbu a4, 3(a0)
-; RV32VB-NEXT:    slli a1, a1, 8
-; RV32VB-NEXT:    or a1, a2, a1
+; RV32VB-NEXT:    lbu a5, 4(a0)
+; RV32VB-NEXT:    lbu a6, 5(a0)
+; RV32VB-NEXT:    lbu a7, 6(a0)
+; RV32VB-NEXT:    lbu t0, 7(a0)
+; RV32VB-NEXT:    slli a2, a2, 8
 ; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a4, a4, 24
+; RV32VB-NEXT:    slli a6, a6, 8
+; RV32VB-NEXT:    or a1, a1, a2
 ; RV32VB-NEXT:    or a3, a4, a3
-; RV32VB-NEXT:    lbu a2, 4(a0)
-; RV32VB-NEXT:    lbu a4, 5(a0)
-; RV32VB-NEXT:    or a1, a1, a3
-; RV32VB-NEXT:    lbu a3, 6(a0)
-; RV32VB-NEXT:    lbu a5, 7(a0)
-; RV32VB-NEXT:    slli a4, a4, 8
-; RV32VB-NEXT:    or a2, a2, a4
-; RV32VB-NEXT:    slli a3, a3, 16
-; RV32VB-NEXT:    slli a5, a5, 24
-; RV32VB-NEXT:    or a3, a5, a3
+; RV32VB-NEXT:    or a2, a5, a6
 ; RV32VB-NEXT:    lbu a4, 8(a0)
 ; RV32VB-NEXT:    lbu a5, 9(a0)
-; RV32VB-NEXT:    or a2, a2, a3
-; RV32VB-NEXT:    lbu a3, 10(a0)
-; RV32VB-NEXT:    lbu a6, 11(a0)
+; RV32VB-NEXT:    lbu a6, 10(a0)
+; RV32VB-NEXT:    lbu t1, 11(a0)
+; RV32VB-NEXT:    slli a7, a7, 16
+; RV32VB-NEXT:    slli t0, t0, 24
 ; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    slli a6, a6, 16
+; RV32VB-NEXT:    slli t1, t1, 24
+; RV32VB-NEXT:    or a7, t0, a7
 ; RV32VB-NEXT:    or a4, a4, a5
-; RV32VB-NEXT:    slli a3, a3, 16
-; RV32VB-NEXT:    slli a6, a6, 24
-; RV32VB-NEXT:    or a3, a6, a3
 ; RV32VB-NEXT:    lbu a5, 12(a0)
-; RV32VB-NEXT:    lbu a6, 13(a0)
-; RV32VB-NEXT:    or a3, a4, a3
-; RV32VB-NEXT:    lbu a4, 14(a0)
+; RV32VB-NEXT:    lbu t0, 13(a0)
+; RV32VB-NEXT:    or a6, t1, a6
+; RV32VB-NEXT:    lbu t1, 14(a0)
 ; RV32VB-NEXT:    lbu a0, 15(a0)
-; RV32VB-NEXT:    slli a6, a6, 8
-; RV32VB-NEXT:    or a5, a5, a6
-; RV32VB-NEXT:    slli a4, a4, 16
+; RV32VB-NEXT:    slli t0, t0, 8
+; RV32VB-NEXT:    or a5, a5, t0
+; RV32VB-NEXT:    slli t1, t1, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
-; RV32VB-NEXT:    or a0, a0, a4
+; RV32VB-NEXT:    or a0, a0, t1
+; RV32VB-NEXT:    or a1, a1, a3
+; RV32VB-NEXT:    or a2, a2, a7
+; RV32VB-NEXT:    or a3, a4, a6
 ; RV32VB-NEXT:    or a0, a5, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
@@ -1473,34 +1444,34 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 2(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 3(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 4(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 5(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 6(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 7(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    lbu a2, 4(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 5(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 6(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 7(a0)
+; RV32VB-PACK-NEXT:    lbu a2, 8(a0)
+; RV32VB-PACK-NEXT:    lbu t1, 9(a0)
+; RV32VB-PACK-NEXT:    lbu t2, 10(a0)
+; RV32VB-PACK-NEXT:    lbu t3, 11(a0)
 ; RV32VB-PACK-NEXT:    packh a3, a3, a4
-; RV32VB-PACK-NEXT:    pack a1, a1, a3
-; RV32VB-PACK-NEXT:    packh a2, a2, a5
-; RV32VB-PACK-NEXT:    packh a3, a6, a7
-; RV32VB-PACK-NEXT:    lbu a4, 8(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 9(a0)
-; RV32VB-PACK-NEXT:    pack a2, a2, a3
-; RV32VB-PACK-NEXT:    lbu a3, 10(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 11(a0)
-; RV32VB-PACK-NEXT:    packh a4, a4, a5
-; RV32VB-PACK-NEXT:    lbu a5, 12(a0)
+; RV32VB-PACK-NEXT:    packh a4, a5, a6
+; RV32VB-PACK-NEXT:    packh a5, a7, t0
+; RV32VB-PACK-NEXT:    lbu a6, 12(a0)
 ; RV32VB-PACK-NEXT:    lbu a7, 13(a0)
 ; RV32VB-PACK-NEXT:    lbu t0, 14(a0)
 ; RV32VB-PACK-NEXT:    lbu a0, 15(a0)
-; RV32VB-PACK-NEXT:    packh a3, a3, a6
-; RV32VB-PACK-NEXT:    pack a3, a4, a3
-; RV32VB-PACK-NEXT:    packh a4, a5, a7
+; RV32VB-PACK-NEXT:    packh a2, a2, t1
+; RV32VB-PACK-NEXT:    packh t1, t2, t3
+; RV32VB-PACK-NEXT:    packh a6, a6, a7
 ; RV32VB-PACK-NEXT:    packh a0, t0, a0
-; RV32VB-PACK-NEXT:    pack a0, a4, a0
+; RV32VB-PACK-NEXT:    pack a1, a1, a3
+; RV32VB-PACK-NEXT:    pack a3, a4, a5
+; RV32VB-PACK-NEXT:    pack a2, a2, t1
+; RV32VB-PACK-NEXT:    pack a0, a6, a0
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
 ;
@@ -1522,32 +1493,31 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu t2, 9(a0)
 ; RV64V-ONLY-NEXT:    lbu t3, 10(a0)
 ; RV64V-ONLY-NEXT:    lbu t4, 11(a0)
+; RV64V-ONLY-NEXT:    li t5, 255
+; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV64V-ONLY-NEXT:    lbu t5, 12(a0)
 ; RV64V-ONLY-NEXT:    lbu t6, 13(a0)
 ; RV64V-ONLY-NEXT:    lbu s0, 14(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 15(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a1
+; RV64V-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t2
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t3
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t4
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a5
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t5
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a6
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, t0
-; RV64V-ONLY-NEXT:    vmv.v.x v8, t1
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t2
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t3
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t4
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t5
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t6
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, s0
-; RV64V-ONLY-NEXT:    li a1, 255
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, a1
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, s0
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, t0
+; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64V-ONLY-NEXT:    .cfi_restore s0
 ; RV64V-ONLY-NEXT:    addi sp, sp, 16
@@ -1556,52 +1526,52 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_loads_contigous:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 1(a0)
-; RVA22U64-NEXT:    lbu a2, 0(a0)
+; RVA22U64-NEXT:    lbu a6, 0(a0)
+; RVA22U64-NEXT:    lbu a2, 1(a0)
 ; RVA22U64-NEXT:    lbu a3, 2(a0)
 ; RVA22U64-NEXT:    lbu a4, 3(a0)
-; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    lbu a5, 4(a0)
+; RVA22U64-NEXT:    lbu a1, 5(a0)
+; RVA22U64-NEXT:    lbu a7, 6(a0)
+; RVA22U64-NEXT:    lbu t0, 7(a0)
+; RVA22U64-NEXT:    slli a2, a2, 8
 ; RVA22U64-NEXT:    slli a3, a3, 16
 ; RVA22U64-NEXT:    slli a4, a4, 24
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    or a1, a1, a3
-; RVA22U64-NEXT:    lbu a2, 4(a0)
-; RVA22U64-NEXT:    lbu a3, 5(a0)
-; RVA22U64-NEXT:    lbu a4, 6(a0)
-; RVA22U64-NEXT:    lbu a5, 7(a0)
-; RVA22U64-NEXT:    slli a2, a2, 32
-; RVA22U64-NEXT:    slli a3, a3, 40
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    slli a4, a4, 48
-; RVA22U64-NEXT:    slli a5, a5, 56
-; RVA22U64-NEXT:    or a4, a4, a5
-; RVA22U64-NEXT:    or a2, a2, a4
-; RVA22U64-NEXT:    lbu a3, 8(a0)
-; RVA22U64-NEXT:    lbu a4, 9(a0)
-; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    slli a5, a5, 32
+; RVA22U64-NEXT:    slli a1, a1, 40
+; RVA22U64-NEXT:    or a6, a6, a2
+; RVA22U64-NEXT:    or t2, a4, a3
+; RVA22U64-NEXT:    or t1, a1, a5
+; RVA22U64-NEXT:    lbu a4, 8(a0)
+; RVA22U64-NEXT:    lbu a5, 9(a0)
 ; RVA22U64-NEXT:    lbu a2, 10(a0)
-; RVA22U64-NEXT:    lbu a5, 11(a0)
-; RVA22U64-NEXT:    slli a4, a4, 8
-; RVA22U64-NEXT:    or a3, a3, a4
+; RVA22U64-NEXT:    lbu a1, 11(a0)
+; RVA22U64-NEXT:    slli a7, a7, 48
+; RVA22U64-NEXT:    slli t0, t0, 56
+; RVA22U64-NEXT:    slli a5, a5, 8
 ; RVA22U64-NEXT:    slli a2, a2, 16
-; RVA22U64-NEXT:    slli a5, a5, 24
-; RVA22U64-NEXT:    or a2, a2, a5
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    lbu a3, 12(a0)
-; RVA22U64-NEXT:    lbu a4, 13(a0)
-; RVA22U64-NEXT:    lbu a5, 14(a0)
+; RVA22U64-NEXT:    slli a1, a1, 24
+; RVA22U64-NEXT:    or a7, t0, a7
+; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    lbu a2, 12(a0)
+; RVA22U64-NEXT:    lbu a5, 13(a0)
+; RVA22U64-NEXT:    lbu a3, 14(a0)
 ; RVA22U64-NEXT:    lbu a0, 15(a0)
-; RVA22U64-NEXT:    slli a3, a3, 32
-; RVA22U64-NEXT:    slli a4, a4, 40
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    slli a5, a5, 48
+; RVA22U64-NEXT:    slli a2, a2, 32
+; RVA22U64-NEXT:    slli a5, a5, 40
+; RVA22U64-NEXT:    or a2, a2, a5
+; RVA22U64-NEXT:    slli a3, a3, 48
 ; RVA22U64-NEXT:    slli a0, a0, 56
-; RVA22U64-NEXT:    or a0, a0, a5
 ; RVA22U64-NEXT:    or a0, a0, a3
+; RVA22U64-NEXT:    or a3, a6, t2
+; RVA22U64-NEXT:    or a5, a7, t1
+; RVA22U64-NEXT:    or a1, a1, a4
 ; RVA22U64-NEXT:    or a0, a0, a2
+; RVA22U64-NEXT:    or a3, a3, a5
+; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-NEXT:    vmv.v.x v8, a1
+; RVA22U64-NEXT:    vmv.v.x v8, a3
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-NEXT:    ret
 ;
@@ -1610,35 +1580,35 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
 ; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
 ; RVA22U64-PACK-NEXT:    lbu a6, 2(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 3(a0)
-; RVA22U64-PACK-NEXT:    packh a7, a1, a2
-; RVA22U64-PACK-NEXT:    lbu a2, 4(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 5(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 6(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 7(a0)
-; RVA22U64-PACK-NEXT:    packh a4, a6, a4
-; RVA22U64-PACK-NEXT:    packw a4, a7, a4
-; RVA22U64-PACK-NEXT:    packh a2, a2, a5
-; RVA22U64-PACK-NEXT:    packh a1, a3, a1
-; RVA22U64-PACK-NEXT:    packw a1, a2, a1
-; RVA22U64-PACK-NEXT:    lbu a2, 8(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 9(a0)
-; RVA22U64-PACK-NEXT:    pack a6, a4, a1
-; RVA22U64-PACK-NEXT:    lbu a7, 10(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 11(a0)
-; RVA22U64-PACK-NEXT:    packh a2, a2, a3
-; RVA22U64-PACK-NEXT:    lbu a3, 12(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 13(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 14(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 3(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 4(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 5(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 6(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 7(a0)
+; RVA22U64-PACK-NEXT:    packh t1, a1, a2
+; RVA22U64-PACK-NEXT:    lbu t2, 8(a0)
+; RVA22U64-PACK-NEXT:    lbu t3, 9(a0)
+; RVA22U64-PACK-NEXT:    lbu t4, 10(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 11(a0)
+; RVA22U64-PACK-NEXT:    packh a6, a6, a7
+; RVA22U64-PACK-NEXT:    packh a7, t0, a3
+; RVA22U64-PACK-NEXT:    packh t0, a4, a5
+; RVA22U64-PACK-NEXT:    lbu a5, 12(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 13(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 14(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 15(a0)
-; RVA22U64-PACK-NEXT:    packh a5, a7, a5
-; RVA22U64-PACK-NEXT:    packw a2, a2, a5
-; RVA22U64-PACK-NEXT:    packh a1, a3, a1
-; RVA22U64-PACK-NEXT:    packh a0, a4, a0
-; RVA22U64-PACK-NEXT:    packw a0, a1, a0
-; RVA22U64-PACK-NEXT:    pack a0, a2, a0
+; RVA22U64-PACK-NEXT:    packh a4, t2, t3
+; RVA22U64-PACK-NEXT:    packh a1, t4, a1
+; RVA22U64-PACK-NEXT:    packh a3, a5, a3
+; RVA22U64-PACK-NEXT:    packh a0, a2, a0
+; RVA22U64-PACK-NEXT:    packw a2, t1, a6
+; RVA22U64-PACK-NEXT:    packw a5, a7, t0
+; RVA22U64-PACK-NEXT:    packw a1, a4, a1
+; RVA22U64-PACK-NEXT:    packw a0, a3, a0
+; RVA22U64-PACK-NEXT:    pack a2, a2, a5
+; RVA22U64-PACK-NEXT:    pack a0, a1, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-PACK-NEXT:    vmv.v.x v8, a6
+; RVA22U64-PACK-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -1660,32 +1630,31 @@ define <16 x i8> @buildvec_v16i8_loads_contigous(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu t2, 9(a0)
 ; RV64ZVE32-NEXT:    lbu t3, 10(a0)
 ; RV64ZVE32-NEXT:    lbu t4, 11(a0)
+; RV64ZVE32-NEXT:    li t5, 255
+; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, t5
 ; RV64ZVE32-NEXT:    lbu t5, 12(a0)
 ; RV64ZVE32-NEXT:    lbu t6, 13(a0)
 ; RV64ZVE32-NEXT:    lbu s0, 14(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 15(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32-NEXT:    vmv.v.x v9, t1
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t2
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t3
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t4
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a5
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t5
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, t0
-; RV64ZVE32-NEXT:    vmv.v.x v8, t1
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t2
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t3
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t4
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t5
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t6
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, s0
-; RV64ZVE32-NEXT:    li a1, 255
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, a1
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, s0
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, t0
+; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64ZVE32-NEXT:    .cfi_restore s0
 ; RV64ZVE32-NEXT:    addi sp, sp, 16
@@ -1763,32 +1732,31 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu t2, 154(a0)
 ; RV32-ONLY-NEXT:    lbu t3, 161(a0)
 ; RV32-ONLY-NEXT:    lbu t4, 163(a0)
+; RV32-ONLY-NEXT:    li t5, 255
+; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV32-ONLY-NEXT:    lbu t5, 93(a0)
 ; RV32-ONLY-NEXT:    lbu t6, 105(a0)
 ; RV32-ONLY-NEXT:    lbu s0, 124(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 144(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a1
+; RV32-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t5
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t3
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a6
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, s0
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t4
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a5
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, t0
-; RV32-ONLY-NEXT:    vmv.v.x v8, t1
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t5
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t6
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t3
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, s0
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t4
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-ONLY-NEXT:    li a0, 255
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, a0
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t2
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, t0
+; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, t2
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-ONLY-NEXT:    .cfi_restore s0
 ; RV32-ONLY-NEXT:    addi sp, sp, 16
@@ -1797,50 +1765,50 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ;
 ; RV32VB-LABEL: buildvec_v16i8_loads_gather:
 ; RV32VB:       # %bb.0:
-; RV32VB-NEXT:    lbu a1, 1(a0)
-; RV32VB-NEXT:    lbu a2, 0(a0)
+; RV32VB-NEXT:    lbu a1, 0(a0)
+; RV32VB-NEXT:    lbu a2, 1(a0)
 ; RV32VB-NEXT:    lbu a3, 22(a0)
 ; RV32VB-NEXT:    lbu a4, 31(a0)
-; RV32VB-NEXT:    slli a1, a1, 8
-; RV32VB-NEXT:    or a1, a2, a1
+; RV32VB-NEXT:    lbu a5, 623(a0)
+; RV32VB-NEXT:    lbu a6, 44(a0)
+; RV32VB-NEXT:    lbu a7, 55(a0)
+; RV32VB-NEXT:    lbu t0, 75(a0)
+; RV32VB-NEXT:    lbu t1, 82(a0)
+; RV32VB-NEXT:    slli a2, a2, 8
 ; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a4, a4, 24
+; RV32VB-NEXT:    or a1, a1, a2
 ; RV32VB-NEXT:    or a3, a4, a3
-; RV32VB-NEXT:    or a1, a1, a3
-; RV32VB-NEXT:    lbu a2, 44(a0)
-; RV32VB-NEXT:    lbu a3, 55(a0)
-; RV32VB-NEXT:    lbu a4, 623(a0)
-; RV32VB-NEXT:    lbu a5, 75(a0)
-; RV32VB-NEXT:    lbu a6, 82(a0)
-; RV32VB-NEXT:    slli a3, a3, 8
-; RV32VB-NEXT:    or a2, a2, a3
-; RV32VB-NEXT:    slli a4, a4, 16
-; RV32VB-NEXT:    slli a5, a5, 24
-; RV32VB-NEXT:    or a4, a5, a4
-; RV32VB-NEXT:    or a2, a2, a4
-; RV32VB-NEXT:    lbu a3, 93(a0)
+; RV32VB-NEXT:    lbu a2, 93(a0)
 ; RV32VB-NEXT:    lbu a4, 105(a0)
-; RV32VB-NEXT:    lbu a5, 124(a0)
-; RV32VB-NEXT:    lbu a7, 144(a0)
-; RV32VB-NEXT:    slli a3, a3, 8
-; RV32VB-NEXT:    lbu t0, 154(a0)
-; RV32VB-NEXT:    lbu t1, 161(a0)
-; RV32VB-NEXT:    or a3, a6, a3
-; RV32VB-NEXT:    slli a4, a4, 16
+; RV32VB-NEXT:    lbu t2, 124(a0)
+; RV32VB-NEXT:    lbu t3, 144(a0)
+; RV32VB-NEXT:    slli a7, a7, 8
+; RV32VB-NEXT:    slli a5, a5, 16
+; RV32VB-NEXT:    slli t0, t0, 24
+; RV32VB-NEXT:    slli a2, a2, 8
+; RV32VB-NEXT:    or a6, a6, a7
+; RV32VB-NEXT:    or a5, t0, a5
+; RV32VB-NEXT:    lbu a7, 154(a0)
+; RV32VB-NEXT:    lbu t0, 161(a0)
+; RV32VB-NEXT:    or a2, t1, a2
 ; RV32VB-NEXT:    lbu a0, 163(a0)
-; RV32VB-NEXT:    slli t1, t1, 24
-; RV32VB-NEXT:    or a4, t1, a4
-; RV32VB-NEXT:    or a3, a3, a4
-; RV32VB-NEXT:    slli a0, a0, 8
-; RV32VB-NEXT:    or a0, a5, a0
-; RV32VB-NEXT:    slli a7, a7, 16
+; RV32VB-NEXT:    slli a4, a4, 16
 ; RV32VB-NEXT:    slli t0, t0, 24
-; RV32VB-NEXT:    or a4, t0, a7
-; RV32VB-NEXT:    or a0, a0, a4
+; RV32VB-NEXT:    or a4, t0, a4
+; RV32VB-NEXT:    slli a0, a0, 8
+; RV32VB-NEXT:    or a0, t2, a0
+; RV32VB-NEXT:    slli t3, t3, 16
+; RV32VB-NEXT:    slli a7, a7, 24
+; RV32VB-NEXT:    or a7, a7, t3
+; RV32VB-NEXT:    or a1, a1, a3
+; RV32VB-NEXT:    or a3, a6, a5
+; RV32VB-NEXT:    or a2, a2, a4
+; RV32VB-NEXT:    or a0, a0, a7
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a3
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    ret
 ;
@@ -1850,34 +1818,34 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 22(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 31(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 623(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 44(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 55(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 75(a0)
+; RV32VB-PACK-NEXT:    lbu t1, 82(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    packh a2, a3, a4
-; RV32VB-PACK-NEXT:    lbu a3, 623(a0)
-; RV32VB-PACK-NEXT:    lbu a4, 44(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 55(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 75(a0)
-; RV32VB-PACK-NEXT:    pack a1, a1, a2
-; RV32VB-PACK-NEXT:    lbu a2, 82(a0)
-; RV32VB-PACK-NEXT:    packh a4, a4, a5
-; RV32VB-PACK-NEXT:    packh a3, a3, a6
-; RV32VB-PACK-NEXT:    pack a3, a4, a3
-; RV32VB-PACK-NEXT:    lbu a4, 154(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 161(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 163(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 93(a0)
-; RV32VB-PACK-NEXT:    lbu t0, 105(a0)
-; RV32VB-PACK-NEXT:    lbu t1, 124(a0)
+; RV32VB-PACK-NEXT:    lbu a2, 154(a0)
+; RV32VB-PACK-NEXT:    lbu t2, 161(a0)
+; RV32VB-PACK-NEXT:    lbu t3, 163(a0)
+; RV32VB-PACK-NEXT:    packh a3, a3, a4
+; RV32VB-PACK-NEXT:    packh a4, a6, a7
+; RV32VB-PACK-NEXT:    packh a5, a5, t0
+; RV32VB-PACK-NEXT:    lbu a6, 93(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 105(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 124(a0)
 ; RV32VB-PACK-NEXT:    lbu a0, 144(a0)
-; RV32VB-PACK-NEXT:    packh a2, a2, a7
-; RV32VB-PACK-NEXT:    packh a5, t0, a5
-; RV32VB-PACK-NEXT:    pack a2, a2, a5
-; RV32VB-PACK-NEXT:    packh a5, t1, a6
-; RV32VB-PACK-NEXT:    packh a0, a0, a4
-; RV32VB-PACK-NEXT:    pack a0, a5, a0
+; RV32VB-PACK-NEXT:    packh a6, t1, a6
+; RV32VB-PACK-NEXT:    packh a7, a7, t2
+; RV32VB-PACK-NEXT:    packh t0, t0, t3
+; RV32VB-PACK-NEXT:    packh a0, a0, a2
+; RV32VB-PACK-NEXT:    pack a1, a1, a3
+; RV32VB-PACK-NEXT:    pack a2, a4, a5
+; RV32VB-PACK-NEXT:    pack a3, a6, a7
+; RV32VB-PACK-NEXT:    pack a0, t0, a0
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
 ;
@@ -1899,32 +1867,31 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu t2, 154(a0)
 ; RV64V-ONLY-NEXT:    lbu t3, 161(a0)
 ; RV64V-ONLY-NEXT:    lbu t4, 163(a0)
+; RV64V-ONLY-NEXT:    li t5, 255
+; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, t5
 ; RV64V-ONLY-NEXT:    lbu t5, 93(a0)
 ; RV64V-ONLY-NEXT:    lbu t6, 105(a0)
 ; RV64V-ONLY-NEXT:    lbu s0, 124(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 144(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a1
+; RV64V-ONLY-NEXT:    vmv.v.x v9, t1
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t5
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t3
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a6
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, s0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t4
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a5
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, t0
-; RV64V-ONLY-NEXT:    vmv.v.x v8, t1
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t5
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t6
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t3
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, s0
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t4
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV64V-ONLY-NEXT:    li a0, 255
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, a0
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t2
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a0
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, t0
+; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, t2
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64V-ONLY-NEXT:    .cfi_restore s0
 ; RV64V-ONLY-NEXT:    addi sp, sp, 16
@@ -1933,90 +1900,98 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_loads_gather:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 1(a0)
-; RVA22U64-NEXT:    lbu a2, 0(a0)
+; RVA22U64-NEXT:    lbu a1, 0(a0)
+; RVA22U64-NEXT:    lbu a2, 1(a0)
 ; RVA22U64-NEXT:    lbu a3, 22(a0)
 ; RVA22U64-NEXT:    lbu a4, 31(a0)
-; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    lbu a6, 623(a0)
+; RVA22U64-NEXT:    lbu t0, 44(a0)
+; RVA22U64-NEXT:    lbu a7, 55(a0)
+; RVA22U64-NEXT:    lbu a5, 75(a0)
+; RVA22U64-NEXT:    lbu t1, 82(a0)
+; RVA22U64-NEXT:    slli a2, a2, 8
 ; RVA22U64-NEXT:    slli a3, a3, 16
 ; RVA22U64-NEXT:    slli a4, a4, 24
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    or a1, a1, a3
-; RVA22U64-NEXT:    lbu a2, 623(a0)
-; RVA22U64-NEXT:    lbu a3, 44(a0)
-; RVA22U64-NEXT:    lbu a4, 55(a0)
-; RVA22U64-NEXT:    lbu a5, 75(a0)
-; RVA22U64-NEXT:    lbu a6, 82(a0)
-; RVA22U64-NEXT:    slli a3, a3, 32
-; RVA22U64-NEXT:    slli a4, a4, 40
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    slli a2, a2, 48
-; RVA22U64-NEXT:    slli a5, a5, 56
-; RVA22U64-NEXT:    or a2, a2, a5
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    or a7, a1, a2
+; RVA22U64-NEXT:    or t2, a1, a2
+; RVA22U64-NEXT:    or t3, a4, a3
 ; RVA22U64-NEXT:    lbu a2, 93(a0)
-; RVA22U64-NEXT:    lbu t0, 105(a0)
-; RVA22U64-NEXT:    lbu a4, 124(a0)
-; RVA22U64-NEXT:    lbu a5, 144(a0)
+; RVA22U64-NEXT:    lbu t4, 105(a0)
+; RVA22U64-NEXT:    lbu t6, 124(a0)
+; RVA22U64-NEXT:    lbu t5, 144(a0)
+; RVA22U64-NEXT:    slli t0, t0, 32
+; RVA22U64-NEXT:    slli a7, a7, 40
+; RVA22U64-NEXT:    slli a6, a6, 48
+; RVA22U64-NEXT:    slli a5, a5, 56
 ; RVA22U64-NEXT:    slli a2, a2, 8
-; RVA22U64-NEXT:    lbu a1, 154(a0)
-; RVA22U64-NEXT:    lbu a3, 161(a0)
-; RVA22U64-NEXT:    or a2, a6, a2
-; RVA22U64-NEXT:    slli t0, t0, 16
+; RVA22U64-NEXT:    or a7, a7, t0
+; RVA22U64-NEXT:    or a5, a5, a6
+; RVA22U64-NEXT:    lbu a3, 154(a0)
+; RVA22U64-NEXT:    lbu a1, 161(a0)
+; RVA22U64-NEXT:    or a2, t1, a2
 ; RVA22U64-NEXT:    lbu a0, 163(a0)
-; RVA22U64-NEXT:    slli a3, a3, 24
-; RVA22U64-NEXT:    or a3, a3, t0
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    slli a4, a4, 32
+; RVA22U64-NEXT:    slli t4, t4, 16
+; RVA22U64-NEXT:    slli a1, a1, 24
+; RVA22U64-NEXT:    or a1, a1, t4
+; RVA22U64-NEXT:    slli t6, t6, 32
 ; RVA22U64-NEXT:    slli a0, a0, 40
-; RVA22U64-NEXT:    or a0, a0, a4
-; RVA22U64-NEXT:    slli a5, a5, 48
-; RVA22U64-NEXT:    slli a1, a1, 56
-; RVA22U64-NEXT:    or a1, a1, a5
+; RVA22U64-NEXT:    or a0, a0, t6
+; RVA22U64-NEXT:    slli t5, t5, 48
+; RVA22U64-NEXT:    slli a3, a3, 56
+; RVA22U64-NEXT:    or a3, a3, t5
+; RVA22U64-NEXT:    or a4, t2, t3
+; RVA22U64-NEXT:    or a5, a5, a7
+; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    or a0, a0, a3
+; RVA22U64-NEXT:    or a4, a4, a5
 ; RVA22U64-NEXT:    or a0, a0, a1
-; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-NEXT:    vmv.v.x v8, a7
+; RVA22U64-NEXT:    vmv.v.x v8, a4
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-NEXT:    ret
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_loads_gather:
 ; RVA22U64-PACK:       # %bb.0:
+; RVA22U64-PACK-NEXT:    addi sp, sp, -16
+; RVA22U64-PACK-NEXT:    .cfi_def_cfa_offset 16
+; RVA22U64-PACK-NEXT:    sd s0, 8(sp) # 8-byte Folded Spill
+; RVA22U64-PACK-NEXT:    .cfi_offset s0, -8
 ; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
 ; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 22(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 31(a0)
-; RVA22U64-PACK-NEXT:    packh a6, a1, a2
-; RVA22U64-PACK-NEXT:    packh a2, a3, a4
-; RVA22U64-PACK-NEXT:    lbu a3, 623(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 44(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 55(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 75(a0)
-; RVA22U64-PACK-NEXT:    packw a2, a6, a2
-; RVA22U64-PACK-NEXT:    lbu a6, 82(a0)
-; RVA22U64-PACK-NEXT:    packh a4, a4, a5
-; RVA22U64-PACK-NEXT:    packh a1, a3, a1
-; RVA22U64-PACK-NEXT:    packw a1, a4, a1
-; RVA22U64-PACK-NEXT:    pack a7, a2, a1
-; RVA22U64-PACK-NEXT:    lbu t0, 154(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 161(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 163(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 105(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 124(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 22(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 31(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 623(a0)
+; RVA22U64-PACK-NEXT:    lbu t3, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu t4, 55(a0)
+; RVA22U64-PACK-NEXT:    lbu t5, 75(a0)
+; RVA22U64-PACK-NEXT:    lbu t1, 82(a0)
+; RVA22U64-PACK-NEXT:    packh t2, a1, a2
+; RVA22U64-PACK-NEXT:    lbu t6, 154(a0)
+; RVA22U64-PACK-NEXT:    lbu s0, 161(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 163(a0)
+; RVA22U64-PACK-NEXT:    packh a6, a6, a7
+; RVA22U64-PACK-NEXT:    packh a7, t3, t4
+; RVA22U64-PACK-NEXT:    packh a2, t0, t5
+; RVA22U64-PACK-NEXT:    lbu a4, 93(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 105(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 124(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 144(a0)
-; RVA22U64-PACK-NEXT:    packh a5, a6, a5
+; RVA22U64-PACK-NEXT:    packh a4, t1, a4
+; RVA22U64-PACK-NEXT:    packh a5, a5, s0
 ; RVA22U64-PACK-NEXT:    packh a1, a1, a3
-; RVA22U64-PACK-NEXT:    packw a1, a5, a1
-; RVA22U64-PACK-NEXT:    packh a2, a2, a4
-; RVA22U64-PACK-NEXT:    packh a0, a0, t0
-; RVA22U64-PACK-NEXT:    packw a0, a2, a0
-; RVA22U64-PACK-NEXT:    pack a0, a1, a0
+; RVA22U64-PACK-NEXT:    packh a0, a0, t6
+; RVA22U64-PACK-NEXT:    packw a3, t2, a6
+; RVA22U64-PACK-NEXT:    packw a2, a7, a2
+; RVA22U64-PACK-NEXT:    packw a4, a4, a5
+; RVA22U64-PACK-NEXT:    packw a0, a1, a0
+; RVA22U64-PACK-NEXT:    pack a1, a3, a2
+; RVA22U64-PACK-NEXT:    pack a0, a4, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-PACK-NEXT:    vmv.v.x v8, a7
+; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
+; RVA22U64-PACK-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
+; RVA22U64-PACK-NEXT:    .cfi_restore s0
+; RVA22U64-PACK-NEXT:    addi sp, sp, 16
+; RVA22U64-PACK-NEXT:    .cfi_def_cfa_offset 0
 ; RVA22U64-PACK-NEXT:    ret
 ;
 ; RV64ZVE32-LABEL: buildvec_v16i8_loads_gather:
@@ -2037,32 +2012,31 @@ define <16 x i8> @buildvec_v16i8_loads_gather(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu t2, 154(a0)
 ; RV64ZVE32-NEXT:    lbu t3, 161(a0)
 ; RV64ZVE32-NEXT:    lbu t4, 163(a0)
+; RV64ZVE32-NEXT:    li t5, 255
+; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, t5
 ; RV64ZVE32-NEXT:    lbu t5, 93(a0)
 ; RV64ZVE32-NEXT:    lbu t6, 105(a0)
 ; RV64ZVE32-NEXT:    lbu s0, 124(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 144(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32-NEXT:    vmv.v.x v9, t1
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t5
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t3
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, s0
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t4
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, t0
-; RV64ZVE32-NEXT:    vmv.v.x v8, t1
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t5
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t6
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t3
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, s0
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t4
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32-NEXT:    li a0, 255
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, a0
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t2
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a0
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, t0
+; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, t2
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ld s0, 8(sp) # 8-byte Folded Reload
 ; RV64ZVE32-NEXT:    .cfi_restore s0
 ; RV64ZVE32-NEXT:    addi sp, sp, 16
@@ -2153,19 +2127,19 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ; RV32VB-NEXT:    lbu a6, 154(a0)
 ; RV32VB-NEXT:    lbu a7, 161(a0)
 ; RV32VB-NEXT:    or a1, a2, a1
-; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    lbu a0, 163(a0)
+; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a7, a7, 24
 ; RV32VB-NEXT:    or a2, a7, a3
-; RV32VB-NEXT:    or a1, a1, a2
 ; RV32VB-NEXT:    slli a0, a0, 8
 ; RV32VB-NEXT:    or a0, a4, a0
 ; RV32VB-NEXT:    slli a5, a5, 16
 ; RV32VB-NEXT:    slli a6, a6, 24
-; RV32VB-NEXT:    or a2, a6, a5
-; RV32VB-NEXT:    or a0, a0, a2
+; RV32VB-NEXT:    or a3, a6, a5
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.i v8, 0
+; RV32VB-NEXT:    or a1, a1, a2
+; RV32VB-NEXT:    or a0, a0, a3
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, zero
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
@@ -2173,26 +2147,26 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ;
 ; RV32VB-PACK-LABEL: buildvec_v16i8_undef_low_half:
 ; RV32VB-PACK:       # %bb.0:
-; RV32VB-PACK-NEXT:    lbu a1, 144(a0)
-; RV32VB-PACK-NEXT:    lbu a2, 154(a0)
-; RV32VB-PACK-NEXT:    lbu a3, 161(a0)
-; RV32VB-PACK-NEXT:    lbu a4, 82(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 93(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 105(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 124(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 163(a0)
-; RV32VB-PACK-NEXT:    packh a4, a4, a5
-; RV32VB-PACK-NEXT:    packh a3, a6, a3
-; RV32VB-PACK-NEXT:    pack a3, a4, a3
-; RV32VB-PACK-NEXT:    packh a0, a7, a0
+; RV32VB-PACK-NEXT:    lbu a1, 82(a0)
+; RV32VB-PACK-NEXT:    lbu a2, 93(a0)
+; RV32VB-PACK-NEXT:    lbu a3, 105(a0)
+; RV32VB-PACK-NEXT:    lbu a4, 124(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 161(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 163(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 144(a0)
+; RV32VB-PACK-NEXT:    lbu a0, 154(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    pack a0, a0, a1
-; RV32VB-PACK-NEXT:    packh a1, a0, a0
-; RV32VB-PACK-NEXT:    pack a1, a1, a1
+; RV32VB-PACK-NEXT:    packh a2, a3, a5
+; RV32VB-PACK-NEXT:    packh a3, a4, a6
+; RV32VB-PACK-NEXT:    packh a0, a7, a0
+; RV32VB-PACK-NEXT:    pack a1, a1, a2
+; RV32VB-PACK-NEXT:    packh a2, a0, a0
+; RV32VB-PACK-NEXT:    pack a2, a2, a2
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
+; RV32VB-PACK-NEXT:    vmv.v.x v8, a2
+; RV32VB-PACK-NEXT:    pack a0, a3, a0
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
 ;
@@ -2228,17 +2202,17 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ; RVA22U64-NEXT:    lbu a2, 154(a0)
 ; RVA22U64-NEXT:    lbu a3, 161(a0)
 ; RVA22U64-NEXT:    or a1, a6, a1
-; RVA22U64-NEXT:    slli a7, a7, 16
 ; RVA22U64-NEXT:    lbu a0, 163(a0)
+; RVA22U64-NEXT:    slli a7, a7, 16
 ; RVA22U64-NEXT:    slli a3, a3, 24
 ; RVA22U64-NEXT:    or a3, a3, a7
-; RVA22U64-NEXT:    or a1, a1, a3
 ; RVA22U64-NEXT:    slli a4, a4, 32
 ; RVA22U64-NEXT:    slli a0, a0, 40
 ; RVA22U64-NEXT:    or a0, a0, a4
 ; RVA22U64-NEXT:    slli a5, a5, 48
 ; RVA22U64-NEXT:    slli a2, a2, 56
 ; RVA22U64-NEXT:    or a2, a2, a5
+; RVA22U64-NEXT:    or a1, a1, a3
 ; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
@@ -2248,26 +2222,26 @@ define <16 x i8> @buildvec_v16i8_undef_low_half(ptr %p) {
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_low_half:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a6, 144(a0)
-; RVA22U64-PACK-NEXT:    lbu a7, 154(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 161(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 82(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu a1, 105(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 124(a0)
-; RVA22U64-PACK-NEXT:    lbu a0, 163(a0)
-; RVA22U64-PACK-NEXT:    packh a4, a4, a5
-; RVA22U64-PACK-NEXT:    packh a1, a1, a3
-; RVA22U64-PACK-NEXT:    packw a1, a4, a1
+; RVA22U64-PACK-NEXT:    lbu a6, 82(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 93(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 105(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 124(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 161(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 163(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 144(a0)
+; RVA22U64-PACK-NEXT:    lbu a0, 154(a0)
+; RVA22U64-PACK-NEXT:    packh a3, a6, a7
+; RVA22U64-PACK-NEXT:    packh a5, t0, a5
+; RVA22U64-PACK-NEXT:    packh a1, a4, a1
 ; RVA22U64-PACK-NEXT:    packh a0, a2, a0
-; RVA22U64-PACK-NEXT:    packh a2, a6, a7
-; RVA22U64-PACK-NEXT:    packw a0, a0, a2
-; RVA22U64-PACK-NEXT:    pack a0, a1, a0
-; RVA22U64-PACK-NEXT:    packh a1, a0, a0
-; RVA22U64-PACK-NEXT:    packw a1, a1, a1
-; RVA22U64-PACK-NEXT:    pack a1, a1, a1
+; RVA22U64-PACK-NEXT:    packw a2, a3, a5
+; RVA22U64-PACK-NEXT:    packh a3, a0, a0
+; RVA22U64-PACK-NEXT:    packw a3, a3, a3
+; RVA22U64-PACK-NEXT:    pack a3, a3, a3
+; RVA22U64-PACK-NEXT:    packw a0, a1, a0
+; RVA22U64-PACK-NEXT:    pack a0, a2, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
+; RVA22U64-PACK-NEXT:    vmv.v.x v8, a3
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -2346,25 +2320,25 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ; RV32VB-LABEL: buildvec_v16i8_undef_high_half:
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    lbu a1, 1(a0)
-; RV32VB-NEXT:    lbu a2, 0(a0)
-; RV32VB-NEXT:    lbu a3, 22(a0)
-; RV32VB-NEXT:    lbu a4, 31(a0)
+; RV32VB-NEXT:    lbu a2, 22(a0)
+; RV32VB-NEXT:    lbu a3, 31(a0)
+; RV32VB-NEXT:    lbu a4, 0(a0)
 ; RV32VB-NEXT:    slli a1, a1, 8
-; RV32VB-NEXT:    or a1, a2, a1
-; RV32VB-NEXT:    slli a3, a3, 16
-; RV32VB-NEXT:    slli a4, a4, 24
-; RV32VB-NEXT:    or a3, a4, a3
-; RV32VB-NEXT:    lbu a2, 44(a0)
-; RV32VB-NEXT:    lbu a4, 55(a0)
-; RV32VB-NEXT:    or a1, a1, a3
+; RV32VB-NEXT:    slli a2, a2, 16
+; RV32VB-NEXT:    slli a3, a3, 24
+; RV32VB-NEXT:    or a1, a4, a1
+; RV32VB-NEXT:    lbu a4, 44(a0)
+; RV32VB-NEXT:    lbu a5, 55(a0)
+; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    lbu a3, 623(a0)
 ; RV32VB-NEXT:    lbu a0, 75(a0)
-; RV32VB-NEXT:    slli a4, a4, 8
-; RV32VB-NEXT:    or a2, a2, a4
+; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    or a4, a4, a5
 ; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
 ; RV32VB-NEXT:    or a0, a0, a3
-; RV32VB-NEXT:    or a0, a2, a0
+; RV32VB-NEXT:    or a1, a1, a2
+; RV32VB-NEXT:    or a0, a4, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
@@ -2378,21 +2352,21 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 22(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 31(a0)
-; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    lbu a2, 623(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 44(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 55(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 623(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 44(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 55(a0)
 ; RV32VB-PACK-NEXT:    lbu a0, 75(a0)
-; RV32VB-PACK-NEXT:    packh a3, a3, a4
-; RV32VB-PACK-NEXT:    pack a1, a1, a3
-; RV32VB-PACK-NEXT:    packh a3, a5, a6
-; RV32VB-PACK-NEXT:    packh a0, a2, a0
+; RV32VB-PACK-NEXT:    packh a1, a1, a2
+; RV32VB-PACK-NEXT:    packh a2, a3, a4
+; RV32VB-PACK-NEXT:    packh a3, a6, a7
+; RV32VB-PACK-NEXT:    packh a0, a5, a0
+; RV32VB-PACK-NEXT:    pack a1, a1, a2
+; RV32VB-PACK-NEXT:    packh a2, a0, a0
 ; RV32VB-PACK-NEXT:    pack a0, a3, a0
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
-; RV32VB-PACK-NEXT:    packh a0, a0, a0
-; RV32VB-PACK-NEXT:    pack a0, a0, a0
+; RV32VB-PACK-NEXT:    pack a0, a2, a2
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
@@ -2422,26 +2396,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ; RVA22U64-LABEL: buildvec_v16i8_undef_high_half:
 ; RVA22U64:       # %bb.0:
 ; RVA22U64-NEXT:    lbu a1, 1(a0)
-; RVA22U64-NEXT:    lbu a2, 0(a0)
-; RVA22U64-NEXT:    lbu a3, 22(a0)
-; RVA22U64-NEXT:    lbu a4, 31(a0)
+; RVA22U64-NEXT:    lbu a2, 22(a0)
+; RVA22U64-NEXT:    lbu a3, 31(a0)
+; RVA22U64-NEXT:    lbu a4, 0(a0)
 ; RVA22U64-NEXT:    slli a1, a1, 8
-; RVA22U64-NEXT:    or a1, a1, a2
-; RVA22U64-NEXT:    slli a3, a3, 16
-; RVA22U64-NEXT:    slli a4, a4, 24
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    or a1, a1, a3
-; RVA22U64-NEXT:    lbu a2, 44(a0)
-; RVA22U64-NEXT:    lbu a3, 55(a0)
-; RVA22U64-NEXT:    lbu a4, 623(a0)
-; RVA22U64-NEXT:    lbu a0, 75(a0)
-; RVA22U64-NEXT:    slli a2, a2, 32
-; RVA22U64-NEXT:    slli a3, a3, 40
+; RVA22U64-NEXT:    slli a2, a2, 16
+; RVA22U64-NEXT:    slli a3, a3, 24
+; RVA22U64-NEXT:    or a1, a1, a4
 ; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    slli a4, a4, 48
+; RVA22U64-NEXT:    lbu a3, 44(a0)
+; RVA22U64-NEXT:    lbu a4, 55(a0)
+; RVA22U64-NEXT:    lbu a5, 623(a0)
+; RVA22U64-NEXT:    lbu a0, 75(a0)
+; RVA22U64-NEXT:    slli a3, a3, 32
+; RVA22U64-NEXT:    slli a4, a4, 40
+; RVA22U64-NEXT:    or a3, a3, a4
+; RVA22U64-NEXT:    slli a5, a5, 48
 ; RVA22U64-NEXT:    slli a0, a0, 56
-; RVA22U64-NEXT:    or a0, a0, a4
-; RVA22U64-NEXT:    or a0, a0, a2
+; RVA22U64-NEXT:    or a0, a0, a5
+; RVA22U64-NEXT:    or a1, a1, a2
+; RVA22U64-NEXT:    or a0, a0, a3
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-NEXT:    vmv.v.x v8, a0
@@ -2450,26 +2424,26 @@ define <16 x i8> @buildvec_v16i8_undef_high_half(ptr %p) {
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_high_half:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu a6, 22(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 0(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 1(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 22(a0)
 ; RVA22U64-PACK-NEXT:    lbu a4, 31(a0)
-; RVA22U64-PACK-NEXT:    packh a1, a1, a2
-; RVA22U64-PACK-NEXT:    lbu a2, 623(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 44(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 55(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 623(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 55(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 75(a0)
-; RVA22U64-PACK-NEXT:    packh a4, a6, a4
-; RVA22U64-PACK-NEXT:    packw a1, a1, a4
-; RVA22U64-PACK-NEXT:    packh a3, a5, a3
-; RVA22U64-PACK-NEXT:    packh a0, a2, a0
-; RVA22U64-PACK-NEXT:    packw a0, a3, a0
-; RVA22U64-PACK-NEXT:    pack a0, a1, a0
+; RVA22U64-PACK-NEXT:    packh a3, a6, a7
+; RVA22U64-PACK-NEXT:    packh a4, t0, a4
+; RVA22U64-PACK-NEXT:    packh a1, a1, a2
+; RVA22U64-PACK-NEXT:    packh a0, a5, a0
+; RVA22U64-PACK-NEXT:    packw a2, a3, a4
+; RVA22U64-PACK-NEXT:    packh a3, a0, a0
+; RVA22U64-PACK-NEXT:    packw a3, a3, a3
+; RVA22U64-PACK-NEXT:    packw a0, a1, a0
+; RVA22U64-PACK-NEXT:    pack a0, a2, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a0
-; RVA22U64-PACK-NEXT:    packh a0, a0, a0
-; RVA22U64-PACK-NEXT:    packw a0, a0, a0
-; RVA22U64-PACK-NEXT:    pack a0, a0, a0
+; RVA22U64-PACK-NEXT:    pack a0, a3, a3
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -2530,54 +2504,53 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu a3, 44(a0)
 ; RV32-ONLY-NEXT:    lbu a4, 55(a0)
 ; RV32-ONLY-NEXT:    lbu a5, 75(a0)
+; RV32-ONLY-NEXT:    li a6, 255
+; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, a6
 ; RV32-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV32-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV32-ONLY-NEXT:    lbu t0, 105(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 161(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a2
+; RV32-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a7
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t0
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, a5
-; RV32-ONLY-NEXT:    vmv.v.x v8, a6
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t0
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-ONLY-NEXT:    li a0, 255
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, a0
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 4
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, a5
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 4
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    ret
 ;
 ; RV32VB-LABEL: buildvec_v16i8_undef_edges:
 ; RV32VB:       # %bb.0:
-; RV32VB-NEXT:    lbu a1, 55(a0)
-; RV32VB-NEXT:    lbu a2, 31(a0)
-; RV32VB-NEXT:    lbu a3, 44(a0)
-; RV32VB-NEXT:    lbu a4, 623(a0)
-; RV32VB-NEXT:    lbu a5, 75(a0)
-; RV32VB-NEXT:    slli a1, a1, 8
+; RV32VB-NEXT:    lbu a1, 623(a0)
+; RV32VB-NEXT:    lbu a2, 55(a0)
+; RV32VB-NEXT:    lbu a3, 75(a0)
+; RV32VB-NEXT:    lbu a4, 31(a0)
+; RV32VB-NEXT:    lbu a5, 44(a0)
+; RV32VB-NEXT:    slli a2, a2, 8
+; RV32VB-NEXT:    slli a1, a1, 16
+; RV32VB-NEXT:    slli a3, a3, 24
+; RV32VB-NEXT:    or a2, a5, a2
+; RV32VB-NEXT:    lbu a5, 82(a0)
+; RV32VB-NEXT:    lbu a6, 93(a0)
 ; RV32VB-NEXT:    or a1, a3, a1
-; RV32VB-NEXT:    slli a4, a4, 16
-; RV32VB-NEXT:    slli a5, a5, 24
-; RV32VB-NEXT:    or a4, a5, a4
-; RV32VB-NEXT:    lbu a3, 82(a0)
-; RV32VB-NEXT:    lbu a5, 93(a0)
-; RV32VB-NEXT:    or a1, a1, a4
-; RV32VB-NEXT:    lbu a4, 105(a0)
+; RV32VB-NEXT:    lbu a3, 105(a0)
 ; RV32VB-NEXT:    lbu a0, 161(a0)
-; RV32VB-NEXT:    slli a5, a5, 8
-; RV32VB-NEXT:    or a3, a3, a5
-; RV32VB-NEXT:    slli a2, a2, 24
-; RV32VB-NEXT:    slli a4, a4, 16
+; RV32VB-NEXT:    slli a6, a6, 8
+; RV32VB-NEXT:    or a5, a5, a6
+; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
-; RV32VB-NEXT:    or a0, a0, a4
-; RV32VB-NEXT:    or a0, a3, a0
+; RV32VB-NEXT:    or a0, a0, a3
+; RV32VB-NEXT:    slli a4, a4, 24
+; RV32VB-NEXT:    or a1, a2, a1
+; RV32VB-NEXT:    or a0, a5, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32VB-NEXT:    vmv.v.x v8, a2
+; RV32VB-NEXT:    vmv.v.x v8, a4
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, zero
@@ -2590,18 +2563,18 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a3, 44(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 55(a0)
 ; RV32VB-PACK-NEXT:    lbu a5, 75(a0)
-; RV32VB-PACK-NEXT:    packh a2, a0, a2
+; RV32VB-PACK-NEXT:    lbu a6, 82(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 93(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 105(a0)
+; RV32VB-PACK-NEXT:    lbu a0, 161(a0)
 ; RV32VB-PACK-NEXT:    packh a3, a3, a4
 ; RV32VB-PACK-NEXT:    packh a1, a1, a5
-; RV32VB-PACK-NEXT:    lbu a4, 82(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 93(a0)
-; RV32VB-PACK-NEXT:    pack a1, a3, a1
-; RV32VB-PACK-NEXT:    lbu a3, 105(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 161(a0)
-; RV32VB-PACK-NEXT:    packh a4, a4, a5
+; RV32VB-PACK-NEXT:    packh a4, a6, a7
+; RV32VB-PACK-NEXT:    packh a0, t0, a0
 ; RV32VB-PACK-NEXT:    packh a5, a0, a0
+; RV32VB-PACK-NEXT:    packh a2, a0, a2
 ; RV32VB-PACK-NEXT:    pack a2, a5, a2
-; RV32VB-PACK-NEXT:    packh a0, a3, a0
+; RV32VB-PACK-NEXT:    pack a1, a3, a1
 ; RV32VB-PACK-NEXT:    pack a0, a4, a0
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a2
@@ -2618,84 +2591,83 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu a3, 44(a0)
 ; RV64V-ONLY-NEXT:    lbu a4, 55(a0)
 ; RV64V-ONLY-NEXT:    lbu a5, 75(a0)
+; RV64V-ONLY-NEXT:    li a6, 255
+; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, a6
 ; RV64V-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV64V-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV64V-ONLY-NEXT:    lbu t0, 105(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 161(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a2
+; RV64V-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a1
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, a5
-; RV64V-ONLY-NEXT:    vmv.v.x v8, a6
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t0
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV64V-ONLY-NEXT:    li a0, 255
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, a0
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 4
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a0
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, a5
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 4
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_undef_edges:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    lbu a1, 31(a0)
+; RVA22U64-NEXT:    lbu a6, 31(a0)
 ; RVA22U64-NEXT:    lbu a2, 44(a0)
 ; RVA22U64-NEXT:    lbu a3, 55(a0)
 ; RVA22U64-NEXT:    lbu a4, 623(a0)
 ; RVA22U64-NEXT:    lbu a5, 75(a0)
 ; RVA22U64-NEXT:    slli a2, a2, 32
 ; RVA22U64-NEXT:    slli a3, a3, 40
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    slli a1, a1, 24
 ; RVA22U64-NEXT:    slli a4, a4, 48
 ; RVA22U64-NEXT:    slli a5, a5, 56
-; RVA22U64-NEXT:    or a4, a4, a5
-; RVA22U64-NEXT:    or a2, a2, a4
+; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    lbu a3, 82(a0)
-; RVA22U64-NEXT:    lbu a4, 93(a0)
-; RVA22U64-NEXT:    add.uw a1, a1, a2
-; RVA22U64-NEXT:    lbu a2, 105(a0)
+; RVA22U64-NEXT:    lbu a1, 93(a0)
+; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    lbu a5, 105(a0)
 ; RVA22U64-NEXT:    lbu a0, 161(a0)
-; RVA22U64-NEXT:    slli a4, a4, 8
-; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    slli a2, a2, 16
+; RVA22U64-NEXT:    slli a1, a1, 8
+; RVA22U64-NEXT:    or a1, a1, a3
+; RVA22U64-NEXT:    slli a5, a5, 16
 ; RVA22U64-NEXT:    slli a0, a0, 24
-; RVA22U64-NEXT:    or a0, a0, a2
-; RVA22U64-NEXT:    or a0, a0, a3
+; RVA22U64-NEXT:    or a0, a0, a5
+; RVA22U64-NEXT:    slli a6, a6, 24
+; RVA22U64-NEXT:    or a2, a2, a4
+; RVA22U64-NEXT:    add.uw a2, a6, a2
+; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RVA22U64-NEXT:    vmv.v.x v8, a1
+; RVA22U64-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-NEXT:    ret
 ;
 ; RVA22U64-PACK-LABEL: buildvec_v16i8_undef_edges:
 ; RVA22U64-PACK:       # %bb.0:
-; RVA22U64-PACK-NEXT:    lbu a1, 623(a0)
-; RVA22U64-PACK-NEXT:    lbu a2, 31(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu a7, 623(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 31(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 44(a0)
 ; RVA22U64-PACK-NEXT:    lbu a4, 55(a0)
 ; RVA22U64-PACK-NEXT:    lbu a5, 75(a0)
-; RVA22U64-PACK-NEXT:    packh a6, a0, a2
-; RVA22U64-PACK-NEXT:    packh a2, a0, a0
-; RVA22U64-PACK-NEXT:    packh a3, a3, a4
-; RVA22U64-PACK-NEXT:    packh a1, a1, a5
-; RVA22U64-PACK-NEXT:    packw a7, a3, a1
-; RVA22U64-PACK-NEXT:    lbu a3, 82(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 105(a0)
+; RVA22U64-PACK-NEXT:    lbu a2, 82(a0)
+; RVA22U64-PACK-NEXT:    lbu a1, 93(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 105(a0)
 ; RVA22U64-PACK-NEXT:    lbu a0, 161(a0)
-; RVA22U64-PACK-NEXT:    packw a1, a2, a6
-; RVA22U64-PACK-NEXT:    pack a1, a1, a7
-; RVA22U64-PACK-NEXT:    packh a3, a3, a4
-; RVA22U64-PACK-NEXT:    packh a0, a5, a0
-; RVA22U64-PACK-NEXT:    packw a0, a3, a0
+; RVA22U64-PACK-NEXT:    packh a4, t0, a4
+; RVA22U64-PACK-NEXT:    packh a5, a7, a5
+; RVA22U64-PACK-NEXT:    packh a1, a2, a1
+; RVA22U64-PACK-NEXT:    packh a0, a3, a0
+; RVA22U64-PACK-NEXT:    packh a2, a0, a0
+; RVA22U64-PACK-NEXT:    packh a3, a0, a6
+; RVA22U64-PACK-NEXT:    packw a3, a2, a3
+; RVA22U64-PACK-NEXT:    packw a2, a2, a2
+; RVA22U64-PACK-NEXT:    packw a4, a4, a5
+; RVA22U64-PACK-NEXT:    packw a0, a1, a0
+; RVA22U64-PACK-NEXT:    pack a1, a3, a4
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
-; RVA22U64-PACK-NEXT:    packw a1, a2, a2
-; RVA22U64-PACK-NEXT:    pack a0, a0, a1
+; RVA22U64-PACK-NEXT:    pack a0, a0, a2
 ; RVA22U64-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RVA22U64-PACK-NEXT:    ret
 ;
@@ -2706,26 +2678,25 @@ define <16 x i8> @buildvec_v16i8_undef_edges(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu a3, 44(a0)
 ; RV64ZVE32-NEXT:    lbu a4, 55(a0)
 ; RV64ZVE32-NEXT:    lbu a5, 75(a0)
+; RV64ZVE32-NEXT:    li a6, 255
+; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, a6
 ; RV64ZVE32-NEXT:    lbu a6, 82(a0)
 ; RV64ZVE32-NEXT:    lbu a7, 93(a0)
 ; RV64ZVE32-NEXT:    lbu t0, 105(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 161(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a2
+; RV64ZVE32-NEXT:    vmv.v.x v9, a6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, a5
-; RV64ZVE32-NEXT:    vmv.v.x v8, a6
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t0
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32-NEXT:    li a0, 255
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, a0
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 4
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a0
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, a5
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 4
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ret
   %p4 = getelementptr i8, ptr %p, i32 31
   %p5 = getelementptr i8, ptr %p, i32 44
@@ -2770,58 +2741,57 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV32-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV32-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV32-ONLY-NEXT:    lbu t0, 124(a0)
+; RV32-ONLY-NEXT:    li t1, 255
+; RV32-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV32-ONLY-NEXT:    vmv.s.x v0, t1
 ; RV32-ONLY-NEXT:    lbu t1, 144(a0)
 ; RV32-ONLY-NEXT:    lbu a0, 154(a0)
-; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV32-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a1
+; RV32-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a7
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 2
+; RV32-ONLY-NEXT:    vslidedown.vi v9, v9, 2
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t0
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-ONLY-NEXT:    vslidedown.vi v9, v9, 1
 ; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, a5
-; RV32-ONLY-NEXT:    vmv.v.x v8, a6
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 2
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t0
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v8, 1
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, t1
-; RV32-ONLY-NEXT:    li a1, 255
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-ONLY-NEXT:    vmv.s.x v0, a1
-; RV32-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, t1
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, a5
+; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV32-ONLY-NEXT:    ret
 ;
 ; RV32VB-LABEL: buildvec_v16i8_loads_undef_scattered:
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    lbu a1, 1(a0)
 ; RV32VB-NEXT:    lbu a2, 0(a0)
-; RV32VB-NEXT:    lbu a3, 55(a0)
-; RV32VB-NEXT:    lbu a4, 44(a0)
+; RV32VB-NEXT:    lbu a3, 44(a0)
+; RV32VB-NEXT:    lbu a4, 55(a0)
 ; RV32VB-NEXT:    slli a1, a1, 8
 ; RV32VB-NEXT:    or a1, a2, a1
-; RV32VB-NEXT:    slli a3, a3, 8
-; RV32VB-NEXT:    or a3, a4, a3
 ; RV32VB-NEXT:    lbu a2, 75(a0)
-; RV32VB-NEXT:    lbu a4, 82(a0)
-; RV32VB-NEXT:    lbu a5, 93(a0)
-; RV32VB-NEXT:    lbu a6, 124(a0)
-; RV32VB-NEXT:    slli a2, a2, 24
-; RV32VB-NEXT:    or a2, a3, a2
-; RV32VB-NEXT:    lbu a3, 144(a0)
+; RV32VB-NEXT:    lbu a5, 82(a0)
+; RV32VB-NEXT:    lbu a6, 93(a0)
+; RV32VB-NEXT:    lbu a7, 124(a0)
+; RV32VB-NEXT:    slli a4, a4, 8
+; RV32VB-NEXT:    or a3, a3, a4
+; RV32VB-NEXT:    lbu a4, 144(a0)
 ; RV32VB-NEXT:    lbu a0, 154(a0)
-; RV32VB-NEXT:    slli a5, a5, 8
-; RV32VB-NEXT:    or a4, a4, a5
-; RV32VB-NEXT:    slli a3, a3, 16
+; RV32VB-NEXT:    slli a6, a6, 8
+; RV32VB-NEXT:    or a5, a5, a6
+; RV32VB-NEXT:    slli a4, a4, 16
 ; RV32VB-NEXT:    slli a0, a0, 24
-; RV32VB-NEXT:    or a0, a0, a3
-; RV32VB-NEXT:    or a0, a6, a0
+; RV32VB-NEXT:    or a0, a0, a4
+; RV32VB-NEXT:    slli a2, a2, 24
+; RV32VB-NEXT:    or a2, a3, a2
+; RV32VB-NEXT:    or a0, a7, a0
 ; RV32VB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a1
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a2
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a4
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32VB-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-NEXT:    ret
 ;
@@ -2831,26 +2801,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV32VB-PACK-NEXT:    lbu a2, 1(a0)
 ; RV32VB-PACK-NEXT:    lbu a3, 44(a0)
 ; RV32VB-PACK-NEXT:    lbu a4, 55(a0)
+; RV32VB-PACK-NEXT:    lbu a5, 75(a0)
+; RV32VB-PACK-NEXT:    lbu a6, 82(a0)
+; RV32VB-PACK-NEXT:    lbu a7, 93(a0)
 ; RV32VB-PACK-NEXT:    packh a1, a1, a2
-; RV32VB-PACK-NEXT:    packh a2, a3, a4
-; RV32VB-PACK-NEXT:    lbu a3, 75(a0)
-; RV32VB-PACK-NEXT:    lbu a4, 82(a0)
-; RV32VB-PACK-NEXT:    lbu a5, 93(a0)
-; RV32VB-PACK-NEXT:    lbu a6, 124(a0)
-; RV32VB-PACK-NEXT:    lbu a7, 144(a0)
-; RV32VB-PACK-NEXT:    lbu a0, 154(a0)
-; RV32VB-PACK-NEXT:    packh a3, a0, a3
-; RV32VB-PACK-NEXT:    pack a2, a2, a3
-; RV32VB-PACK-NEXT:    packh a3, a4, a5
-; RV32VB-PACK-NEXT:    packh a0, a7, a0
-; RV32VB-PACK-NEXT:    packh a4, a6, a0
-; RV32VB-PACK-NEXT:    pack a0, a4, a0
-; RV32VB-PACK-NEXT:    packh a4, a0, a0
-; RV32VB-PACK-NEXT:    pack a1, a1, a4
+; RV32VB-PACK-NEXT:    lbu a2, 144(a0)
+; RV32VB-PACK-NEXT:    lbu t0, 154(a0)
+; RV32VB-PACK-NEXT:    packh a3, a3, a4
+; RV32VB-PACK-NEXT:    lbu a0, 124(a0)
+; RV32VB-PACK-NEXT:    packh a4, a6, a7
+; RV32VB-PACK-NEXT:    packh a2, a2, t0
+; RV32VB-PACK-NEXT:    packh a5, a0, a5
+; RV32VB-PACK-NEXT:    pack a3, a3, a5
+; RV32VB-PACK-NEXT:    packh a5, a0, a0
+; RV32VB-PACK-NEXT:    packh a0, a0, a0
+; RV32VB-PACK-NEXT:    pack a0, a0, a2
+; RV32VB-PACK-NEXT:    pack a1, a1, a5
 ; RV32VB-PACK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a1
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a2
-; RV32VB-PACK-NEXT:    pack a1, a3, a4
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a3
+; RV32VB-PACK-NEXT:    pack a1, a4, a5
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
@@ -2865,28 +2835,27 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV64V-ONLY-NEXT:    lbu a6, 82(a0)
 ; RV64V-ONLY-NEXT:    lbu a7, 93(a0)
 ; RV64V-ONLY-NEXT:    lbu t0, 124(a0)
+; RV64V-ONLY-NEXT:    li t1, 255
+; RV64V-ONLY-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64V-ONLY-NEXT:    vmv.s.x v0, t1
 ; RV64V-ONLY-NEXT:    lbu t1, 144(a0)
 ; RV64V-ONLY-NEXT:    lbu a0, 154(a0)
-; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64V-ONLY-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a1
+; RV64V-ONLY-NEXT:    vmv.v.x v9, a6
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 2
+; RV64V-ONLY-NEXT:    vslidedown.vi v9, v9, 2
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a4
+; RV64V-ONLY-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 1
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, a5
-; RV64V-ONLY-NEXT:    vmv.v.x v8, a6
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 2
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t0
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v8, 1
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, t1
-; RV64V-ONLY-NEXT:    li a1, 255
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64V-ONLY-NEXT:    vmv.s.x v0, a1
-; RV64V-ONLY-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a0
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, t1
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, a5
+; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, a0
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_v16i8_loads_undef_scattered:
@@ -2897,26 +2866,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RVA22U64-NEXT:    lbu a4, 55(a0)
 ; RVA22U64-NEXT:    slli a1, a1, 8
 ; RVA22U64-NEXT:    or a6, a2, a1
+; RVA22U64-NEXT:    lbu a7, 75(a0)
+; RVA22U64-NEXT:    lbu a5, 82(a0)
+; RVA22U64-NEXT:    lbu a1, 93(a0)
+; RVA22U64-NEXT:    lbu a2, 124(a0)
 ; RVA22U64-NEXT:    slli a3, a3, 32
 ; RVA22U64-NEXT:    slli a4, a4, 40
 ; RVA22U64-NEXT:    or a3, a3, a4
-; RVA22U64-NEXT:    lbu a2, 75(a0)
-; RVA22U64-NEXT:    lbu a4, 82(a0)
-; RVA22U64-NEXT:    lbu a5, 93(a0)
-; RVA22U64-NEXT:    lbu a1, 124(a0)
-; RVA22U64-NEXT:    slli a2, a2, 56
-; RVA22U64-NEXT:    or a2, a2, a3
-; RVA22U64-NEXT:    or a2, a6, a2
-; RVA22U64-NEXT:    lbu a3, 144(a0)
+; RVA22U64-NEXT:    lbu a4, 144(a0)
 ; RVA22U64-NEXT:    lbu a0, 154(a0)
-; RVA22U64-NEXT:    slli a5, a5, 8
-; RVA22U64-NEXT:    or a4, a4, a5
-; RVA22U64-NEXT:    slli a3, a3, 48
+; RVA22U64-NEXT:    slli a1, a1, 8
+; RVA22U64-NEXT:    or a1, a1, a5
+; RVA22U64-NEXT:    slli a4, a4, 48
 ; RVA22U64-NEXT:    slli a0, a0, 56
-; RVA22U64-NEXT:    or a0, a0, a3
-; RVA22U64-NEXT:    slli a1, a1, 32
-; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    or a0, a0, a4
+; RVA22U64-NEXT:    slli a7, a7, 56
+; RVA22U64-NEXT:    or a3, a7, a3
+; RVA22U64-NEXT:    slli a2, a2, 32
+; RVA22U64-NEXT:    or a0, a0, a2
+; RVA22U64-NEXT:    or a2, a6, a3
+; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-NEXT:    vmv.v.x v8, a2
 ; RVA22U64-NEXT:    vslide1down.vx v8, v8, a0
@@ -2926,26 +2895,26 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RVA22U64-PACK:       # %bb.0:
 ; RVA22U64-PACK-NEXT:    lbu a1, 0(a0)
 ; RVA22U64-PACK-NEXT:    lbu a2, 1(a0)
-; RVA22U64-PACK-NEXT:    lbu a3, 44(a0)
-; RVA22U64-PACK-NEXT:    lbu a4, 55(a0)
-; RVA22U64-PACK-NEXT:    packh a6, a1, a2
-; RVA22U64-PACK-NEXT:    packh a2, a3, a4
-; RVA22U64-PACK-NEXT:    lbu a3, 75(a0)
-; RVA22U64-PACK-NEXT:    lbu a7, 82(a0)
-; RVA22U64-PACK-NEXT:    lbu a5, 93(a0)
-; RVA22U64-PACK-NEXT:    lbu t0, 124(a0)
-; RVA22U64-PACK-NEXT:    packh a3, a0, a3
-; RVA22U64-PACK-NEXT:    packw a2, a2, a3
-; RVA22U64-PACK-NEXT:    packh a3, a0, a0
-; RVA22U64-PACK-NEXT:    lbu a4, 144(a0)
-; RVA22U64-PACK-NEXT:    lbu a0, 154(a0)
-; RVA22U64-PACK-NEXT:    packw a1, a6, a3
-; RVA22U64-PACK-NEXT:    pack a1, a1, a2
-; RVA22U64-PACK-NEXT:    packh a2, a7, a5
-; RVA22U64-PACK-NEXT:    packh a0, a4, a0
-; RVA22U64-PACK-NEXT:    packh a4, t0, a0
-; RVA22U64-PACK-NEXT:    packw a0, a4, a0
-; RVA22U64-PACK-NEXT:    packw a2, a2, a3
+; RVA22U64-PACK-NEXT:    lbu a7, 44(a0)
+; RVA22U64-PACK-NEXT:    lbu t0, 55(a0)
+; RVA22U64-PACK-NEXT:    lbu a6, 75(a0)
+; RVA22U64-PACK-NEXT:    lbu a5, 82(a0)
+; RVA22U64-PACK-NEXT:    lbu a3, 93(a0)
+; RVA22U64-PACK-NEXT:    packh t1, a1, a2
+; RVA22U64-PACK-NEXT:    lbu a2, 144(a0)
+; RVA22U64-PACK-NEXT:    lbu a4, 154(a0)
+; RVA22U64-PACK-NEXT:    packh a1, a7, t0
+; RVA22U64-PACK-NEXT:    lbu a0, 124(a0)
+; RVA22U64-PACK-NEXT:    packh a3, a5, a3
+; RVA22U64-PACK-NEXT:    packh a2, a2, a4
+; RVA22U64-PACK-NEXT:    packh a4, a0, a6
+; RVA22U64-PACK-NEXT:    packw a1, a1, a4
+; RVA22U64-PACK-NEXT:    packh a4, a0, a0
+; RVA22U64-PACK-NEXT:    packh a0, a0, a0
+; RVA22U64-PACK-NEXT:    packw a5, t1, a4
+; RVA22U64-PACK-NEXT:    packw a0, a0, a2
+; RVA22U64-PACK-NEXT:    packw a2, a3, a4
+; RVA22U64-PACK-NEXT:    pack a1, a5, a1
 ; RVA22U64-PACK-NEXT:    pack a0, a2, a0
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.v.x v8, a1
@@ -2962,28 +2931,27 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) {
 ; RV64ZVE32-NEXT:    lbu a6, 82(a0)
 ; RV64ZVE32-NEXT:    lbu a7, 93(a0)
 ; RV64ZVE32-NEXT:    lbu t0, 124(a0)
+; RV64ZVE32-NEXT:    li t1, 255
+; RV64ZVE32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32-NEXT:    vmv.s.x v0, t1
 ; RV64ZVE32-NEXT:    lbu t1, 144(a0)
 ; RV64ZVE32-NEXT:    lbu a0, 154(a0)
-; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; RV64ZVE32-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32-NEXT:    vmv.v.x v9, a6
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a7
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 2
+; RV64ZVE32-NEXT:    vslidedown.vi v9, v9, 2
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 1
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, a5
-; RV64ZVE32-NEXT:    vmv.v.x v8, a6
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 2
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t0
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v8, 1
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, t1
-; RV64ZVE32-NEXT:    li a1, 255
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32-NEXT:    vmv.s.x v0, a1
-; RV64ZVE32-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 8, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, t1
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, a5
+; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 8, v0.t
 ; RV64ZVE32-NEXT:    ret
   %p2 = getelementptr i8, ptr %p, i32 1
   %p3 = getelementptr i8, ptr %p, i32 22
@@ -3042,91 +3010,91 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RV32-ONLY:       # %bb.0:
 ; RV32-ONLY-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV32-ONLY-NEXT:    vmv.v.x v8, a0
+; RV32-ONLY-NEXT:    vmv.v.x v9, a4
+; RV32-ONLY-NEXT:    vmv.v.i v0, 15
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a5
 ; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a2
-; RV32-ONLY-NEXT:    vslide1down.vx v9, v8, a3
-; RV32-ONLY-NEXT:    vmv.v.x v8, a4
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a5
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a6
-; RV32-ONLY-NEXT:    vmv.v.i v0, 15
-; RV32-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-ONLY-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV32-ONLY-NEXT:    vslide1down.vx v9, v9, a6
+; RV32-ONLY-NEXT:    vslide1down.vx v10, v8, a3
+; RV32-ONLY-NEXT:    vslide1down.vx v8, v9, a7
+; RV32-ONLY-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV32-ONLY-NEXT:    ret
 ;
 ; RV32VB-LABEL: buildvec_v8i8_pack:
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    slli a7, a7, 24
 ; RV32VB-NEXT:    andi a6, a6, 255
-; RV32VB-NEXT:    slli a6, a6, 16
-; RV32VB-NEXT:    or a6, a7, a6
 ; RV32VB-NEXT:    andi a4, a4, 255
 ; RV32VB-NEXT:    andi a5, a5, 255
-; RV32VB-NEXT:    slli a5, a5, 8
-; RV32VB-NEXT:    or a4, a4, a5
-; RV32VB-NEXT:    or a4, a4, a6
 ; RV32VB-NEXT:    slli a3, a3, 24
 ; RV32VB-NEXT:    andi a2, a2, 255
-; RV32VB-NEXT:    slli a2, a2, 16
-; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    andi a0, a0, 255
 ; RV32VB-NEXT:    andi a1, a1, 255
+; RV32VB-NEXT:    slli a6, a6, 16
+; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    slli a2, a2, 16
 ; RV32VB-NEXT:    slli a1, a1, 8
+; RV32VB-NEXT:    or a6, a7, a6
+; RV32VB-NEXT:    or a4, a4, a5
+; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    or a0, a0, a1
+; RV32VB-NEXT:    or a1, a4, a6
 ; RV32VB-NEXT:    or a0, a0, a2
 ; RV32VB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a0
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a4
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-NEXT:    ret
 ;
 ; RV32VB-PACK-LABEL: buildvec_v8i8_pack:
 ; RV32VB-PACK:       # %bb.0:
 ; RV32VB-PACK-NEXT:    packh a6, a6, a7
 ; RV32VB-PACK-NEXT:    packh a4, a4, a5
-; RV32VB-PACK-NEXT:    pack a4, a4, a6
 ; RV32VB-PACK-NEXT:    packh a2, a2, a3
 ; RV32VB-PACK-NEXT:    packh a0, a0, a1
+; RV32VB-PACK-NEXT:    pack a1, a4, a6
 ; RV32VB-PACK-NEXT:    pack a0, a0, a2
 ; RV32VB-PACK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a0
-; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a4
+; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32VB-PACK-NEXT:    ret
 ;
 ; RV64V-ONLY-LABEL: buildvec_v8i8_pack:
 ; RV64V-ONLY:       # %bb.0:
 ; RV64V-ONLY-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64V-ONLY-NEXT:    vmv.v.x v8, a0
+; RV64V-ONLY-NEXT:    vmv.v.x v9, a4
+; RV64V-ONLY-NEXT:    vmv.v.i v0, 15
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a1
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a2
-; RV64V-ONLY-NEXT:    vslide1down.vx v9, v8, a3
-; RV64V-ONLY-NEXT:    vmv.v.x v8, a4
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a5
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a6
-; RV64V-ONLY-NEXT:    vmv.v.i v0, 15
-; RV64V-ONLY-NEXT:    vslide1down.vx v8, v8, a7
-; RV64V-ONLY-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64V-ONLY-NEXT:    vslide1down.vx v9, v9, a6
+; RV64V-ONLY-NEXT:    vslide1down.vx v10, v8, a3
+; RV64V-ONLY-NEXT:    vslide1down.vx v8, v9, a7
+; RV64V-ONLY-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64V-ONLY-NEXT:    ret
 ;
 ; RVA22U64-LABEL: buildvec_v8i8_pack:
 ; RVA22U64:       # %bb.0:
-; RVA22U64-NEXT:    andi a4, a4, 255
-; RVA22U64-NEXT:    slli a4, a4, 32
+; RVA22U64-NEXT:    andi t0, a4, 255
 ; RVA22U64-NEXT:    andi a5, a5, 255
-; RVA22U64-NEXT:    slli a5, a5, 40
-; RVA22U64-NEXT:    or a4, a4, a5
 ; RVA22U64-NEXT:    slli a7, a7, 56
-; RVA22U64-NEXT:    andi a5, a6, 255
-; RVA22U64-NEXT:    slli a5, a5, 48
-; RVA22U64-NEXT:    or a5, a7, a5
-; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    andi a4, a6, 255
 ; RVA22U64-NEXT:    andi a2, a2, 255
-; RVA22U64-NEXT:    slli a2, a2, 16
 ; RVA22U64-NEXT:    andi a3, a3, 255
-; RVA22U64-NEXT:    slli a3, a3, 24
-; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    andi a0, a0, 255
 ; RVA22U64-NEXT:    andi a1, a1, 255
+; RVA22U64-NEXT:    slli t0, t0, 32
+; RVA22U64-NEXT:    slli a5, a5, 40
+; RVA22U64-NEXT:    slli a4, a4, 48
+; RVA22U64-NEXT:    slli a2, a2, 16
+; RVA22U64-NEXT:    slli a3, a3, 24
 ; RVA22U64-NEXT:    slli a1, a1, 8
+; RVA22U64-NEXT:    or a5, a5, t0
+; RVA22U64-NEXT:    or a4, a7, a4
+; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    or a0, a0, a1
+; RVA22U64-NEXT:    or a4, a4, a5
 ; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    or a0, a0, a4
 ; RVA22U64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -3137,11 +3105,11 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RVA22U64-PACK:       # %bb.0:
 ; RVA22U64-PACK-NEXT:    packh a6, a6, a7
 ; RVA22U64-PACK-NEXT:    packh a4, a4, a5
-; RVA22U64-PACK-NEXT:    packw a4, a4, a6
 ; RVA22U64-PACK-NEXT:    packh a2, a2, a3
 ; RVA22U64-PACK-NEXT:    packh a0, a0, a1
+; RVA22U64-PACK-NEXT:    packw a1, a4, a6
 ; RVA22U64-PACK-NEXT:    packw a0, a0, a2
-; RVA22U64-PACK-NEXT:    pack a0, a0, a4
+; RVA22U64-PACK-NEXT:    pack a0, a0, a1
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.s.x v8, a0
 ; RVA22U64-PACK-NEXT:    ret
@@ -3150,15 +3118,15 @@ define <8 x i8> @buildvec_v8i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RV64ZVE32:       # %bb.0:
 ; RV64ZVE32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64ZVE32-NEXT:    vmv.v.x v8, a0
+; RV64ZVE32-NEXT:    vmv.v.x v9, a4
+; RV64ZVE32-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a1
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32-NEXT:    vslide1down.vx v9, v8, a3
-; RV64ZVE32-NEXT:    vmv.v.x v8, a4
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32-NEXT:    vslide1down.vx v9, v9, a6
+; RV64ZVE32-NEXT:    vslide1down.vx v10, v8, a3
+; RV64ZVE32-NEXT:    vslide1down.vx v8, v9, a7
+; RV64ZVE32-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32-NEXT:    ret
   %v1 = insertelement <8 x i8> poison, i8 %e1, i32 0
   %v2 = insertelement <8 x i8> %v1, i8 %e2, i32 1
@@ -3188,32 +3156,32 @@ define <6 x i8> @buildvec_v6i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    slli a3, a3, 24
 ; RV32VB-NEXT:    andi a2, a2, 255
-; RV32VB-NEXT:    slli a2, a2, 16
-; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    andi a0, a0, 255
 ; RV32VB-NEXT:    andi a1, a1, 255
+; RV32VB-NEXT:    andi a4, a4, 255
+; RV32VB-NEXT:    andi a5, a5, 255
+; RV32VB-NEXT:    slli a2, a2, 16
 ; RV32VB-NEXT:    slli a1, a1, 8
+; RV32VB-NEXT:    slli a5, a5, 8
+; RV32VB-NEXT:    or a2, a3, a2
 ; RV32VB-NEXT:    or a0, a0, a1
 ; RV32VB-NEXT:    or a0, a0, a2
-; RV32VB-NEXT:    andi a1, a4, 255
-; RV32VB-NEXT:    andi a2, a5, 255
-; RV32VB-NEXT:    slli a2, a2, 8
-; RV32VB-NEXT:    or a1, a1, a2
+; RV32VB-NEXT:    or a4, a4, a5
 ; RV32VB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a0
-; RV32VB-NEXT:    vslide1down.vx v8, v8, a1
+; RV32VB-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32VB-NEXT:    ret
 ;
 ; RV32VB-PACK-LABEL: buildvec_v6i8_pack:
 ; RV32VB-PACK:       # %bb.0:
 ; RV32VB-PACK-NEXT:    packh a2, a2, a3
 ; RV32VB-PACK-NEXT:    packh a0, a0, a1
-; RV32VB-PACK-NEXT:    pack a0, a0, a2
 ; RV32VB-PACK-NEXT:    packh a1, a4, a5
+; RV32VB-PACK-NEXT:    packh a3, a0, a0
+; RV32VB-PACK-NEXT:    pack a0, a0, a2
 ; RV32VB-PACK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-PACK-NEXT:    vmv.v.x v8, a0
-; RV32VB-PACK-NEXT:    packh a0, a0, a0
-; RV32VB-PACK-NEXT:    pack a0, a1, a0
+; RV32VB-PACK-NEXT:    pack a0, a1, a3
 ; RV32VB-PACK-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32VB-PACK-NEXT:    ret
 ;
@@ -3232,21 +3200,21 @@ define <6 x i8> @buildvec_v6i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RVA22U64-LABEL: buildvec_v6i8_pack:
 ; RVA22U64:       # %bb.0:
 ; RVA22U64-NEXT:    andi a2, a2, 255
-; RVA22U64-NEXT:    slli a2, a2, 16
 ; RVA22U64-NEXT:    andi a3, a3, 255
-; RVA22U64-NEXT:    slli a3, a3, 24
-; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    andi a0, a0, 255
 ; RVA22U64-NEXT:    andi a1, a1, 255
+; RVA22U64-NEXT:    andi a4, a4, 255
+; RVA22U64-NEXT:    andi a5, a5, 255
+; RVA22U64-NEXT:    slli a2, a2, 16
+; RVA22U64-NEXT:    slli a3, a3, 24
 ; RVA22U64-NEXT:    slli a1, a1, 8
+; RVA22U64-NEXT:    slli a4, a4, 32
+; RVA22U64-NEXT:    slli a5, a5, 40
+; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    or a0, a0, a2
-; RVA22U64-NEXT:    andi a1, a4, 255
-; RVA22U64-NEXT:    slli a1, a1, 32
-; RVA22U64-NEXT:    andi a2, a5, 255
-; RVA22U64-NEXT:    slli a2, a2, 40
-; RVA22U64-NEXT:    or a1, a1, a2
-; RVA22U64-NEXT:    or a0, a0, a1
+; RVA22U64-NEXT:    or a4, a4, a5
+; RVA22U64-NEXT:    or a0, a0, a4
 ; RVA22U64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RVA22U64-NEXT:    vmv.s.x v8, a0
 ; RVA22U64-NEXT:    ret
@@ -3255,10 +3223,10 @@ define <6 x i8> @buildvec_v6i8_pack(i8 %e1, i8 %e2, i8 %e3, i8 %e4, i8 %e5, i8 %
 ; RVA22U64-PACK:       # %bb.0:
 ; RVA22U64-PACK-NEXT:    packh a2, a2, a3
 ; RVA22U64-PACK-NEXT:    packh a0, a0, a1
-; RVA22U64-PACK-NEXT:    packw a0, a0, a2
 ; RVA22U64-PACK-NEXT:    packh a1, a4, a5
-; RVA22U64-PACK-NEXT:    packh a2, a0, a0
-; RVA22U64-PACK-NEXT:    packw a1, a1, a2
+; RVA22U64-PACK-NEXT:    packh a3, a0, a0
+; RVA22U64-PACK-NEXT:    packw a0, a0, a2
+; RVA22U64-PACK-NEXT:    packw a1, a1, a3
 ; RVA22U64-PACK-NEXT:    pack a0, a0, a1
 ; RVA22U64-PACK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RVA22U64-PACK-NEXT:    vmv.s.x v8, a0
@@ -3298,9 +3266,9 @@ define <4 x i16> @buildvec_v4i16_pack(i16 %e1, i16 %e2, i16 %e3, i16 %e4) {
 ; RV32VB:       # %bb.0:
 ; RV32VB-NEXT:    slli a3, a3, 16
 ; RV32VB-NEXT:    zext.h a2, a2
-; RV32VB-NEXT:    or a2, a2, a3
 ; RV32VB-NEXT:    slli a1, a1, 16
 ; RV32VB-NEXT:    zext.h a0, a0
+; RV32VB-NEXT:    or a2, a2, a3
 ; RV32VB-NEXT:    or a0, a0, a1
 ; RV32VB-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32VB-NEXT:    vmv.v.x v8, a0
@@ -3329,11 +3297,11 @@ define <4 x i16> @buildvec_v4i16_pack(i16 %e1, i16 %e2, i16 %e3, i16 %e4) {
 ; RVA22U64:       # %bb.0:
 ; RVA22U64-NEXT:    slli a3, a3, 48
 ; RVA22U64-NEXT:    zext.h a2, a2
-; RVA22U64-NEXT:    slli a2, a2, 32
-; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    zext.h a0, a0
 ; RVA22U64-NEXT:    zext.h a1, a1
+; RVA22U64-NEXT:    slli a2, a2, 32
 ; RVA22U64-NEXT:    slli a1, a1, 16
+; RVA22U64-NEXT:    or a2, a2, a3
 ; RVA22U64-NEXT:    or a0, a0, a1
 ; RVA22U64-NEXT:    or a0, a0, a2
 ; RVA22U64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
@@ -3455,3 +3423,5 @@ define <4 x i1> @buildvec_i1_splat(i1 %e1) {
   ret <4 x i1> %v4
 }
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV64: {{.*}}

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index 6cab1bc2185287..a25014295f9e88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -63,8 +63,8 @@ define i8 @explode_8xi8(<8 x i8> %v) {
 ; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s a6, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, a6, a0
 ; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a0, a6, a0
 ; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, a0, a5
@@ -124,17 +124,17 @@ define i8 @explode_16xi8(<16 x i8> %v) {
 ; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s t6, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, t6, a0
 ; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, a5, a7
-; CHECK-NEXT:    add a5, a5, t0
-; CHECK-NEXT:    add a0, a0, a5
 ; CHECK-NEXT:    add t1, t1, t2
+; CHECK-NEXT:    add a0, t6, a0
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    add a5, a5, a7
 ; CHECK-NEXT:    add t1, t1, t3
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a5, a5, t0
 ; CHECK-NEXT:    add t1, t1, t4
+; CHECK-NEXT:    add a0, a0, a5
 ; CHECK-NEXT:    add t1, t1, t5
 ; CHECK-NEXT:    add a0, a0, t1
 ; CHECK-NEXT:    ret
@@ -233,8 +233,8 @@ define i16 @explode_8xi16(<8 x i16> %v) {
 ; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s a6, v8
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, a6, a0
 ; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a0, a6, a0
 ; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, a0, a5
@@ -260,54 +260,54 @@ define i16 @explode_8xi16(<8 x i16> %v) {
 define i16 @explode_16xi16(<16 x i16> %v) {
 ; CHECK-LABEL: explode_16xi16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 2
+; CHECK-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 8
 ; CHECK-NEXT:    vmv.x.s a0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vslidedown.vi v10, v8, 9
 ; CHECK-NEXT:    vmv.x.s a1, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 4
+; CHECK-NEXT:    vslidedown.vi v10, v8, 10
 ; CHECK-NEXT:    vmv.x.s a2, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 5
+; CHECK-NEXT:    vslidedown.vi v10, v8, 11
 ; CHECK-NEXT:    vmv.x.s a3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 6
+; CHECK-NEXT:    vslidedown.vi v10, v8, 12
 ; CHECK-NEXT:    vmv.x.s a4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 7
+; CHECK-NEXT:    vslidedown.vi v10, v8, 13
 ; CHECK-NEXT:    vmv.x.s a5, v10
-; CHECK-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 8
+; CHECK-NEXT:    vslidedown.vi v10, v8, 14
 ; CHECK-NEXT:    vmv.x.s a6, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 9
+; CHECK-NEXT:    vslidedown.vi v10, v8, 15
 ; CHECK-NEXT:    vmv.x.s a7, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 10
-; CHECK-NEXT:    vmv.x.s t0, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 11
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vmv.x.s t0, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 4
 ; CHECK-NEXT:    vmv.x.s t1, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 12
-; CHECK-NEXT:    vmv.x.s t2, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 13
+; CHECK-NEXT:    vslidedown.vi v10, v8, 5
+; CHECK-NEXT:    vmv.x.s t2, v9
+; CHECK-NEXT:    vslidedown.vi v9, v8, 6
 ; CHECK-NEXT:    vmv.x.s t3, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 14
-; CHECK-NEXT:    vmv.x.s t4, v10
-; CHECK-NEXT:    vslidedown.vi v10, v8, 15
-; CHECK-NEXT:    vmv.x.s t5, v10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 7
+; CHECK-NEXT:    vmv.x.s t4, v9
 ; CHECK-NEXT:    vmv.s.x v9, zero
+; CHECK-NEXT:    vmv.x.s t5, v10
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vredxor.vs v8, v8, v9
 ; CHECK-NEXT:    vmv.x.s t6, v8
+; CHECK-NEXT:    add t0, t0, t1
+; CHECK-NEXT:    add t2, t2, t3
+; CHECK-NEXT:    add a0, t5, a0
+; CHECK-NEXT:    add a3, a3, a4
+; CHECK-NEXT:    add t0, t6, t0
+; CHECK-NEXT:    add t2, t2, t4
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, t6, a0
-; CHECK-NEXT:    add a2, a2, a3
-; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    add a3, a3, a5
+; CHECK-NEXT:    add t0, t0, t2
 ; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    add a5, a5, a6
-; CHECK-NEXT:    add a5, a5, a7
-; CHECK-NEXT:    add a5, a5, t0
-; CHECK-NEXT:    add a0, a0, a5
-; CHECK-NEXT:    add t1, t1, t2
-; CHECK-NEXT:    add t1, t1, t3
-; CHECK-NEXT:    add t1, t1, t4
-; CHECK-NEXT:    add t1, t1, t5
-; CHECK-NEXT:    add a0, a0, t1
+; CHECK-NEXT:    add a3, a3, a6
+; CHECK-NEXT:    add a0, t0, a0
+; CHECK-NEXT:    add a3, a3, a7
+; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    ret
   %e0 = extractelement <16 x i16> %v, i32 0
   %e1 = extractelement <16 x i16> %v, i32 1
@@ -401,58 +401,58 @@ define i32 @explode_4xi32(<4 x i32> %v) {
 define i32 @explode_8xi32(<8 x i32> %v) {
 ; RV32-LABEL: explode_8xi32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a1, v10
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a2, v10
+; RV32-NEXT:    vmv.x.s a0, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vmv.x.s a1, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vmv.x.s a4, v10
+; RV32-NEXT:    vmv.x.s a2, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 7
-; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v9, v8, 2
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vmv.x.s a4, v9
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vmv.x.s a5, v10
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a6, v8
+; RV32-NEXT:    add a4, a4, a5
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a6, a0
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    add a4, a6, a4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add a0, a4, a0
+; RV32-NEXT:    add a0, a0, a3
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_8xi32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v10
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 4
-; RV64-NEXT:    vmv.x.s a2, v10
+; RV64-NEXT:    vmv.x.s a0, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vmv.x.s a1, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-NEXT:    vmv.x.s a4, v10
+; RV64-NEXT:    vmv.x.s a2, v10
 ; RV64-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vmv.x.s a3, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v9, v8, 2
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vmv.x.s a4, v9
 ; RV64-NEXT:    vmv.s.x v9, zero
+; RV64-NEXT:    vmv.x.s a5, v10
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s a6, v8
+; RV64-NEXT:    add a4, a4, a5
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, a6, a0
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    add a4, a6, a4
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    addw a0, a0, a5
+; RV64-NEXT:    add a0, a4, a0
+; RV64-NEXT:    addw a0, a0, a3
 ; RV64-NEXT:    ret
   %e0 = extractelement <8 x i32> %v, i32 0
   %e1 = extractelement <8 x i32> %v, i32 1
@@ -484,24 +484,27 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV32-NEXT:    addi s0, sp, 128
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a1, v12
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vmv.x.s a2, v12
+; RV32-NEXT:    vmv.x.s a0, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vmv.x.s a1, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vmv.x.s a2, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 7
+; RV32-NEXT:    vmv.x.s a3, v12
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vslidedown.vi v13, v8, 3
+; RV32-NEXT:    mv a4, sp
 ; RV32-NEXT:    vmv.x.s a5, v12
-; RV32-NEXT:    mv a6, sp
+; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    vmv.x.s a6, v13
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v8, (a6)
-; RV32-NEXT:    lw a6, 32(sp)
+; RV32-NEXT:    vse32.v v8, (a4)
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vredxor.vs v8, v8, v12
+; RV32-NEXT:    lw a4, 32(sp)
 ; RV32-NEXT:    lw a7, 36(sp)
 ; RV32-NEXT:    lw t0, 40(sp)
 ; RV32-NEXT:    lw t1, 44(sp)
@@ -509,22 +512,19 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV32-NEXT:    lw t3, 52(sp)
 ; RV32-NEXT:    lw t4, 56(sp)
 ; RV32-NEXT:    lw t5, 60(sp)
-; RV32-NEXT:    vmv.s.x v9, zero
-; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s t6, v8
+; RV32-NEXT:    add a5, a5, a6
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, t6, a0
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    add a5, t6, a5
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a0, a0, a5
+; RV32-NEXT:    add a0, a5, a0
+; RV32-NEXT:    add a3, a3, a4
 ; RV32-NEXT:    add a7, a7, t0
-; RV32-NEXT:    add a7, a7, t1
-; RV32-NEXT:    add a0, a0, a7
 ; RV32-NEXT:    add t2, t2, t3
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a7, a7, t1
 ; RV32-NEXT:    add t2, t2, t4
+; RV32-NEXT:    add a0, a0, a7
 ; RV32-NEXT:    add t2, t2, t5
 ; RV32-NEXT:    add a0, a0, t2
 ; RV32-NEXT:    addi sp, s0, -128
@@ -548,24 +548,27 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV64-NEXT:    addi s0, sp, 128
 ; RV64-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-NEXT:    vmv.x.s a0, v12
-; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v12
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 4
-; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vmv.x.s a0, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vmv.x.s a1, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 6
-; RV64-NEXT:    vmv.x.s a4, v12
+; RV64-NEXT:    vmv.x.s a2, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 7
+; RV64-NEXT:    vmv.x.s a3, v12
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vslidedown.vi v13, v8, 3
+; RV64-NEXT:    mv a4, sp
 ; RV64-NEXT:    vmv.x.s a5, v12
-; RV64-NEXT:    mv a6, sp
+; RV64-NEXT:    vmv.s.x v12, zero
+; RV64-NEXT:    vmv.x.s a6, v13
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vse32.v v8, (a6)
-; RV64-NEXT:    lw a6, 32(sp)
+; RV64-NEXT:    vse32.v v8, (a4)
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v12
+; RV64-NEXT:    lw a4, 32(sp)
 ; RV64-NEXT:    lw a7, 36(sp)
 ; RV64-NEXT:    lw t0, 40(sp)
 ; RV64-NEXT:    lw t1, 44(sp)
@@ -573,22 +576,19 @@ define i32 @explode_16xi32(<16 x i32> %v) {
 ; RV64-NEXT:    lw t3, 52(sp)
 ; RV64-NEXT:    lw t4, 56(sp)
 ; RV64-NEXT:    lw t5, 60(sp)
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s t6, v8
+; RV64-NEXT:    add a5, a5, a6
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, t6, a0
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a2, a2, a4
+; RV64-NEXT:    add a5, t6, a5
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a0, a0, a5
+; RV64-NEXT:    add a0, a5, a0
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    add a7, a7, t0
-; RV64-NEXT:    add a7, a7, t1
-; RV64-NEXT:    add a0, a0, a7
 ; RV64-NEXT:    add t2, t2, t3
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a7, a7, t1
 ; RV64-NEXT:    add t2, t2, t4
+; RV64-NEXT:    add a0, a0, a7
 ; RV64-NEXT:    add t2, t2, t5
 ; RV64-NEXT:    addw a0, a0, t2
 ; RV64-NEXT:    addi sp, s0, -128
@@ -639,9 +639,9 @@ define i64 @explode_2xi64(<2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -666,28 +666,29 @@ define i64 @explode_4xi64(<4 x i64> %v) {
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v12, v10, a0
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vsrl.vx v12, v10, a0
-; RV32-NEXT:    vmv.x.s a3, v12
-; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vmv.s.x v12, zero
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vredxor.vs v12, v8, v12
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    vmv.x.s a1, v10
+; RV32-NEXT:    vsrl.vx v10, v10, a0
+; RV32-NEXT:    vmv.x.s a2, v8
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s a3, v10
+; RV32-NEXT:    vmv.x.s a4, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vsrl.vx v8, v12, a0
+; RV32-NEXT:    vmv.x.s a0, v12
 ; RV32-NEXT:    vmv.x.s a5, v8
-; RV32-NEXT:    add a2, a5, a2
-; RV32-NEXT:    sltu a5, a2, a5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, a5
-; RV32-NEXT:    add a1, a0, a3
-; RV32-NEXT:    add a0, a2, a4
-; RV32-NEXT:    sltu a2, a0, a2
-; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    sltu a6, a1, a0
+; RV32-NEXT:    add a3, a5, a3
+; RV32-NEXT:    add a0, a1, a2
+; RV32-NEXT:    add a3, a3, a6
+; RV32-NEXT:    add a3, a3, a4
+; RV32-NEXT:    sltu a1, a0, a1
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: explode_4xi64:
@@ -721,59 +722,60 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s a1, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 3
 ; RV32-NEXT:    vmv.x.s a2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s a3, v16
-; RV32-NEXT:    vmv.x.s a4, v12
+; RV32-NEXT:    vsrl.vx v12, v12, a0
+; RV32-NEXT:    vmv.x.s a1, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s a5, v16
+; RV32-NEXT:    vmv.x.s a4, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
+; RV32-NEXT:    vmv.x.s a3, v16
+; RV32-NEXT:    vslidedown.vi v16, v8, 5
+; RV32-NEXT:    vmv.x.s a5, v12
+; RV32-NEXT:    vsrl.vx v12, v12, a0
 ; RV32-NEXT:    vmv.x.s a6, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s a7, v16
-; RV32-NEXT:    vmv.x.s t0, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 6
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s t1, v16
-; RV32-NEXT:    vmv.x.s t2, v12
-; RV32-NEXT:    vslidedown.vi v12, v8, 7
-; RV32-NEXT:    vsrl.vx v16, v12, a0
-; RV32-NEXT:    vmv.x.s t3, v16
-; RV32-NEXT:    vmv.x.s t4, v12
-; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    vmv.x.s a7, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
+; RV32-NEXT:    vmv.x.s t0, v16
+; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v8, v8, v9
+; RV32-NEXT:    vredxor.vs v16, v8, v16
+; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    vmv.x.s t1, v12
+; RV32-NEXT:    vsrl.vx v12, v12, a0
+; RV32-NEXT:    vmv.x.s t2, v8
+; RV32-NEXT:    vsrl.vx v8, v8, a0
+; RV32-NEXT:    vmv.x.s t3, v12
+; RV32-NEXT:    vmv.x.s t4, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v9, v8, a0
-; RV32-NEXT:    vmv.x.s a0, v9
+; RV32-NEXT:    vsrl.vx v8, v16, a0
+; RV32-NEXT:    vmv.x.s a0, v16
 ; RV32-NEXT:    vmv.x.s t5, v8
-; RV32-NEXT:    add a2, t5, a2
-; RV32-NEXT:    sltu t5, a2, t5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, t5
-; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    add a1, t5, a1
 ; RV32-NEXT:    add a4, a2, a4
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    sltu a1, a4, a2
-; RV32-NEXT:    add a1, a1, a5
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a6, a4, a6
-; RV32-NEXT:    sltu a1, a6, a4
-; RV32-NEXT:    add a1, a1, a7
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t0, a6, t0
-; RV32-NEXT:    sltu a1, t0, a6
-; RV32-NEXT:    add a1, a1, t1
+; RV32-NEXT:    add a5, a4, a5
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a1, a1, a6
+; RV32-NEXT:    sltu a2, a5, a4
+; RV32-NEXT:    add a7, a5, a7
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t2, t0, t2
-; RV32-NEXT:    sltu a1, t2, t0
+; RV32-NEXT:    add a2, a2, t0
+; RV32-NEXT:    sltu a1, a7, a5
+; RV32-NEXT:    add t1, a7, t1
+; RV32-NEXT:    add a2, a0, a2
 ; RV32-NEXT:    add a1, a1, t3
-; RV32-NEXT:    add a1, a0, a1
-; RV32-NEXT:    add a0, t2, t4
-; RV32-NEXT:    sltu a2, a0, t2
+; RV32-NEXT:    sltu a3, t1, a7
+; RV32-NEXT:    add a0, t1, t2
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a3, a3, t4
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    sltu a2, a0, t1
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    ret
 ;
@@ -792,21 +794,21 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
 ; RV64-NEXT:    vmv.x.s a0, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    mv a1, sp
+; RV64-NEXT:    vmv.x.s a2, v12
+; RV64-NEXT:    vmv.s.x v12, zero
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    ld a2, 32(sp)
+; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v12
+; RV64-NEXT:    ld a1, 32(sp)
 ; RV64-NEXT:    ld a3, 40(sp)
 ; RV64-NEXT:    ld a4, 48(sp)
 ; RV64-NEXT:    ld a5, 56(sp)
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s a6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, a6, a0
 ; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a0, a6, a0
+; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    add a0, a0, a3
 ; RV64-NEXT:    add a0, a0, a5
@@ -840,20 +842,20 @@ define i64 @explode_8xi64(<8 x i64> %v) {
 define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-LABEL: explode_16xi64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
-; RV32-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s11, 0(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi sp, sp, -64
+; RV32-NEXT:    .cfi_def_cfa_offset 64
+; RV32-NEXT:    sw s0, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s10, 20(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s11, 16(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    .cfi_offset s1, -8
 ; RV32-NEXT:    .cfi_offset s2, -12
@@ -866,121 +868,129 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    .cfi_offset s9, -40
 ; RV32-NEXT:    .cfi_offset s10, -44
 ; RV32-NEXT:    .cfi_offset s11, -48
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 8 * vlenb
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v16, v8, 2
+; RV32-NEXT:    vslidedown.vi v24, v8, 2
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s a1, v24
-; RV32-NEXT:    vmv.x.s a2, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 3
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s a3, v24
-; RV32-NEXT:    vmv.x.s a4, v16
+; RV32-NEXT:    vslidedown.vi v0, v8, 3
 ; RV32-NEXT:    vslidedown.vi v16, v8, 4
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s a5, v24
+; RV32-NEXT:    vmv.x.s a1, v24
+; RV32-NEXT:    vsrl.vx v24, v24, a0
+; RV32-NEXT:    vmv.x.s a2, v24
+; RV32-NEXT:    vslidedown.vi v24, v8, 5
+; RV32-NEXT:    vmv.x.s a3, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s a4, v0
+; RV32-NEXT:    vslidedown.vi v0, v8, 6
+; RV32-NEXT:    vmv.x.s a5, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    vmv.x.s a6, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 5
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s a7, v24
-; RV32-NEXT:    vmv.x.s t0, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 6
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s t1, v24
-; RV32-NEXT:    vmv.x.s t2, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 7
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s t3, v24
+; RV32-NEXT:    vmv.x.s a7, v24
+; RV32-NEXT:    vsrl.vx v24, v24, a0
+; RV32-NEXT:    vmv.x.s t0, v24
+; RV32-NEXT:    vslidedown.vi v24, v8, 8
+; RV32-NEXT:    vmv.x.s t1, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s t2, v0
+; RV32-NEXT:    vslidedown.vi v0, v8, 9
+; RV32-NEXT:    vmv.x.s t3, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    vmv.x.s t4, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 8
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s t5, v24
-; RV32-NEXT:    vmv.x.s t6, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 9
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s0, v24
-; RV32-NEXT:    vmv.x.s s1, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 10
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s2, v24
+; RV32-NEXT:    vmv.x.s t5, v24
+; RV32-NEXT:    vsrl.vx v24, v24, a0
+; RV32-NEXT:    vmv.x.s t6, v24
+; RV32-NEXT:    vslidedown.vi v24, v8, 11
+; RV32-NEXT:    vmv.x.s s0, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s s1, v0
+; RV32-NEXT:    vslidedown.vi v0, v8, 12
+; RV32-NEXT:    vmv.x.s s2, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    vmv.x.s s3, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 11
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s4, v24
-; RV32-NEXT:    vmv.x.s s5, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 12
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s6, v24
-; RV32-NEXT:    vmv.x.s s7, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 13
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.x.s s9, v24
-; RV32-NEXT:    vmv.x.s s8, v16
-; RV32-NEXT:    vslidedown.vi v16, v8, 14
-; RV32-NEXT:    vsrl.vx v24, v16, a0
-; RV32-NEXT:    vmv.s.x v17, zero
+; RV32-NEXT:    addi s4, sp, 16
+; RV32-NEXT:    vs8r.v v16, (s4) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.x.s s4, v24
+; RV32-NEXT:    vsrl.vx v24, v24, a0
+; RV32-NEXT:    vmv.x.s s5, v24
+; RV32-NEXT:    vslidedown.vi v24, v8, 14
+; RV32-NEXT:    vmv.x.s s6, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s s7, v0
+; RV32-NEXT:    vmv.s.x v7, zero
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v17, v8, v17
+; RV32-NEXT:    vredxor.vs v16, v8, v7
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 15
+; RV32-NEXT:    addi s8, sp, 16
+; RV32-NEXT:    vl8r.v v0, (s8) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv.x.s s8, v0
+; RV32-NEXT:    vsrl.vx v0, v0, a0
+; RV32-NEXT:    vmv.x.s s9, v0
+; RV32-NEXT:    vsrl.vx v0, v24, a0
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v18, v17, a0
-; RV32-NEXT:    vmv.x.s s10, v18
+; RV32-NEXT:    vsrl.vx v17, v16, a0
+; RV32-NEXT:    vmv.x.s s10, v16
 ; RV32-NEXT:    vmv.x.s s11, v17
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vx v0, v8, a0
-; RV32-NEXT:    add a1, s10, a1
+; RV32-NEXT:    vsrl.vx v16, v8, a0
 ; RV32-NEXT:    add a2, s11, a2
-; RV32-NEXT:    sltu a0, a2, s11
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    add a0, a0, a3
-; RV32-NEXT:    add a4, a2, a4
-; RV32-NEXT:    sltu a1, a4, a2
-; RV32-NEXT:    add a1, a1, a5
+; RV32-NEXT:    add a1, s10, a1
+; RV32-NEXT:    sltu a0, a1, s10
+; RV32-NEXT:    add a0, a2, a0
+; RV32-NEXT:    add a0, a0, a4
+; RV32-NEXT:    add a3, a1, a3
+; RV32-NEXT:    sltu a1, a3, a1
+; RV32-NEXT:    add a1, a1, a6
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a6, a4, a6
-; RV32-NEXT:    sltu a1, a6, a4
-; RV32-NEXT:    add a1, a1, a7
+; RV32-NEXT:    add a5, a3, a5
+; RV32-NEXT:    sltu a1, a5, a3
+; RV32-NEXT:    add a1, a1, t0
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t0, a6, t0
-; RV32-NEXT:    sltu a1, t0, a6
-; RV32-NEXT:    add a1, a1, t1
+; RV32-NEXT:    add a7, a5, a7
+; RV32-NEXT:    sltu a1, a7, a5
+; RV32-NEXT:    add a1, a1, t2
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t2, t0, t2
-; RV32-NEXT:    sltu a1, t2, t0
-; RV32-NEXT:    add a1, a1, t3
+; RV32-NEXT:    add t1, a7, t1
+; RV32-NEXT:    sltu a1, t1, a7
+; RV32-NEXT:    add a1, a1, t4
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t4, t2, t4
-; RV32-NEXT:    sltu a1, t4, t2
-; RV32-NEXT:    add a1, a1, t5
+; RV32-NEXT:    add t3, t1, t3
+; RV32-NEXT:    sltu a1, t3, t1
+; RV32-NEXT:    add a1, a1, t6
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add t6, t4, t6
-; RV32-NEXT:    sltu a1, t6, t4
-; RV32-NEXT:    add a1, a1, s0
+; RV32-NEXT:    add t5, t3, t5
+; RV32-NEXT:    sltu a1, t5, t3
+; RV32-NEXT:    add a1, a1, s1
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s1, t6, s1
-; RV32-NEXT:    sltu a1, s1, t6
-; RV32-NEXT:    add a1, a1, s2
+; RV32-NEXT:    add s0, t5, s0
+; RV32-NEXT:    sltu a1, s0, t5
+; RV32-NEXT:    add a1, a1, s3
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s3, s1, s3
-; RV32-NEXT:    sltu a1, s3, s1
-; RV32-NEXT:    add a1, a1, s4
+; RV32-NEXT:    add s2, s0, s2
+; RV32-NEXT:    sltu a1, s2, s0
+; RV32-NEXT:    add a1, a1, s5
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s5, s3, s5
-; RV32-NEXT:    sltu a1, s5, s3
-; RV32-NEXT:    add a1, a1, s6
+; RV32-NEXT:    add s4, s2, s4
+; RV32-NEXT:    sltu a1, s4, s2
+; RV32-NEXT:    add a1, a1, s7
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add s7, s5, s7
-; RV32-NEXT:    sltu a1, s7, s5
+; RV32-NEXT:    add s6, s4, s6
+; RV32-NEXT:    sltu a1, s6, s4
 ; RV32-NEXT:    add a1, a1, s9
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vmv.x.s a1, v24
-; RV32-NEXT:    add s8, s7, s8
-; RV32-NEXT:    sltu a2, s8, s7
+; RV32-NEXT:    vmv.x.s a1, v0
+; RV32-NEXT:    add s8, s6, s8
+; RV32-NEXT:    sltu a2, s8, s6
 ; RV32-NEXT:    add a1, a2, a1
-; RV32-NEXT:    vmv.x.s a2, v16
+; RV32-NEXT:    vmv.x.s a2, v24
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vmv.x.s a1, v0
+; RV32-NEXT:    vmv.x.s a1, v16
 ; RV32-NEXT:    add a2, s8, a2
 ; RV32-NEXT:    sltu a3, a2, s8
 ; RV32-NEXT:    add a1, a3, a1
@@ -989,18 +999,22 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    add a0, a2, a0
 ; RV32-NEXT:    sltu a2, a0, a2
 ; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s11, 0(sp) # 4-byte Folded Reload
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add sp, sp, a2
+; RV32-NEXT:    .cfi_def_cfa sp, 64
+; RV32-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s10, 20(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s11, 16(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
 ; RV32-NEXT:    .cfi_restore s1
 ; RV32-NEXT:    .cfi_restore s2
@@ -1013,7 +1027,7 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    .cfi_restore s9
 ; RV32-NEXT:    .cfi_restore s10
 ; RV32-NEXT:    .cfi_restore s11
-; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    addi sp, sp, 64
 ; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
@@ -1032,11 +1046,14 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV64-NEXT:    vslidedown.vi v16, v8, 2
 ; RV64-NEXT:    vmv.x.s a0, v16
 ; RV64-NEXT:    vslidedown.vi v16, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v16
-; RV64-NEXT:    mv a2, sp
+; RV64-NEXT:    mv a1, sp
+; RV64-NEXT:    vmv.x.s a2, v16
+; RV64-NEXT:    vmv.s.x v16, zero
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vse64.v v8, (a2)
-; RV64-NEXT:    ld a2, 32(sp)
+; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vredxor.vs v8, v8, v16
+; RV64-NEXT:    ld a1, 32(sp)
 ; RV64-NEXT:    ld a3, 40(sp)
 ; RV64-NEXT:    ld a4, 48(sp)
 ; RV64-NEXT:    ld a5, 56(sp)
@@ -1048,20 +1065,17 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV64-NEXT:    ld t3, 104(sp)
 ; RV64-NEXT:    ld t4, 112(sp)
 ; RV64-NEXT:    ld t5, 120(sp)
-; RV64-NEXT:    vmv.s.x v9, zero
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s t6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, t6, a0
 ; RV64-NEXT:    add a0, a0, a2
+; RV64-NEXT:    add a0, t6, a0
+; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    add a0, a0, a3
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    add a5, a5, a7
-; RV64-NEXT:    add a0, a0, a5
 ; RV64-NEXT:    add t0, t0, t1
+; RV64-NEXT:    add a0, a0, a3
+; RV64-NEXT:    add a5, a5, a7
 ; RV64-NEXT:    add t0, t0, t2
+; RV64-NEXT:    add a0, a0, a5
 ; RV64-NEXT:    add t0, t0, t3
 ; RV64-NEXT:    add a0, a0, t0
 ; RV64-NEXT:    add t4, t4, t5
@@ -1116,22 +1130,22 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
 ; RV32-NEXT:    vmv.x.s a0, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 3
-; RV32-NEXT:    vmv.x.s a1, v12
-; RV32-NEXT:    vmv.x.s a2, v9
+; RV32-NEXT:    vmv.x.s a1, v9
+; RV32-NEXT:    vmv.x.s a2, v12
 ; RV32-NEXT:    vslidedown.vi v12, v9, 1
 ; RV32-NEXT:    vmv.x.s a3, v12
 ; RV32-NEXT:    vslidedown.vi v12, v9, 2
 ; RV32-NEXT:    vmv.x.s a4, v12
 ; RV32-NEXT:    vslidedown.vi v9, v9, 3
-; RV32-NEXT:    vmv.x.s a5, v9
-; RV32-NEXT:    vmv.x.s a6, v10
+; RV32-NEXT:    vmv.x.s a5, v10
+; RV32-NEXT:    vmv.x.s a6, v9
 ; RV32-NEXT:    vslidedown.vi v9, v10, 1
 ; RV32-NEXT:    vmv.x.s a7, v9
 ; RV32-NEXT:    vslidedown.vi v9, v10, 2
 ; RV32-NEXT:    vmv.x.s t0, v9
 ; RV32-NEXT:    vslidedown.vi v9, v10, 3
-; RV32-NEXT:    vmv.x.s t1, v9
-; RV32-NEXT:    vmv.x.s t2, v11
+; RV32-NEXT:    vmv.x.s t1, v11
+; RV32-NEXT:    vmv.x.s t2, v9
 ; RV32-NEXT:    vslidedown.vi v9, v11, 1
 ; RV32-NEXT:    vmv.x.s t3, v9
 ; RV32-NEXT:    vslidedown.vi v9, v11, 2
@@ -1142,18 +1156,18 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s t6, v8
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, t6, a0
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    add a2, a2, a4
 ; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    add t1, t2, t1
+; RV32-NEXT:    add a0, t6, a0
+; RV32-NEXT:    add a1, a1, a4
 ; RV32-NEXT:    add a5, a5, a7
-; RV32-NEXT:    add a5, a5, t0
-; RV32-NEXT:    add a0, a0, a5
-; RV32-NEXT:    add t1, t1, t2
 ; RV32-NEXT:    add t1, t1, t3
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a5, a5, t0
 ; RV32-NEXT:    add t1, t1, t4
+; RV32-NEXT:    add a0, a0, a5
 ; RV32-NEXT:    add t1, t1, t5
 ; RV32-NEXT:    add a0, a0, t1
 ; RV32-NEXT:    ret
@@ -1164,22 +1178,22 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
 ; RV64-NEXT:    vmv.x.s a0, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 3
-; RV64-NEXT:    vmv.x.s a1, v12
-; RV64-NEXT:    vmv.x.s a2, v9
+; RV64-NEXT:    vmv.x.s a1, v9
+; RV64-NEXT:    vmv.x.s a2, v12
 ; RV64-NEXT:    vslidedown.vi v12, v9, 1
 ; RV64-NEXT:    vmv.x.s a3, v12
 ; RV64-NEXT:    vslidedown.vi v12, v9, 2
 ; RV64-NEXT:    vmv.x.s a4, v12
 ; RV64-NEXT:    vslidedown.vi v9, v9, 3
-; RV64-NEXT:    vmv.x.s a5, v9
-; RV64-NEXT:    vmv.x.s a6, v10
+; RV64-NEXT:    vmv.x.s a5, v10
+; RV64-NEXT:    vmv.x.s a6, v9
 ; RV64-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64-NEXT:    vmv.x.s a7, v9
 ; RV64-NEXT:    vslidedown.vi v9, v10, 2
 ; RV64-NEXT:    vmv.x.s t0, v9
 ; RV64-NEXT:    vslidedown.vi v9, v10, 3
-; RV64-NEXT:    vmv.x.s t1, v9
-; RV64-NEXT:    vmv.x.s t2, v11
+; RV64-NEXT:    vmv.x.s t1, v11
+; RV64-NEXT:    vmv.x.s t2, v9
 ; RV64-NEXT:    vslidedown.vi v9, v11, 1
 ; RV64-NEXT:    vmv.x.s t3, v9
 ; RV64-NEXT:    vslidedown.vi v9, v11, 2
@@ -1190,18 +1204,18 @@ define i32 @explode_16xi32_exact_vlen(<16 x i32> %v) vscale_range(2, 2) {
 ; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT:    vredxor.vs v8, v8, v9
 ; RV64-NEXT:    vmv.x.s t6, v8
-; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    add a0, t6, a0
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    add a2, a2, a4
 ; RV64-NEXT:    add a0, a0, a2
-; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    add a5, a6, a5
+; RV64-NEXT:    add t1, t2, t1
+; RV64-NEXT:    add a0, t6, a0
+; RV64-NEXT:    add a1, a1, a4
 ; RV64-NEXT:    add a5, a5, a7
-; RV64-NEXT:    add a5, a5, t0
-; RV64-NEXT:    add a0, a0, a5
-; RV64-NEXT:    add t1, t1, t2
 ; RV64-NEXT:    add t1, t1, t3
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a5, a5, t0
 ; RV64-NEXT:    add t1, t1, t4
+; RV64-NEXT:    add a0, a0, a5
 ; RV64-NEXT:    add t1, t1, t5
 ; RV64-NEXT:    addw a0, a0, t1
 ; RV64-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index c65e7aec712ae4..66af5718fb9dc5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -54,12 +54,10 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
 ; V128-NEXT:    vmv1r.v v12, v9
 ; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; V128-NEXT:    vid.v v9
+; V128-NEXT:    vmv.v.i v0, 10
 ; V128-NEXT:    vsrl.vi v14, v9, 1
-; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
 ; V128-NEXT:    vrgatherei16.vv v10, v8, v14
-; V128-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; V128-NEXT:    vmv.v.i v0, 10
-; V128-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
 ; V128-NEXT:    vmv.v.v v8, v10
 ; V128-NEXT:    ret
@@ -191,10 +189,12 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; V128-NEXT:    vwaddu.vv v10, v8, v8
 ; V128-NEXT:    li a0, -1
+; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; V128-NEXT:    vid.v v11
+; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; V128-NEXT:    vwmaccu.vx v10, a0, v8
 ; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; V128-NEXT:    vid.v v8
-; V128-NEXT:    vsrl.vi v8, v8, 1
+; V128-NEXT:    vsrl.vi v8, v11, 1
 ; V128-NEXT:    vmv.v.i v0, 10
 ; V128-NEXT:    vadd.vi v8, v8, 1
 ; V128-NEXT:    vrgather.vv v10, v9, v8, v0.t
@@ -206,10 +206,12 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
 ; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; V512-NEXT:    vwaddu.vv v10, v8, v8
 ; V512-NEXT:    li a0, -1
+; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; V512-NEXT:    vid.v v11
+; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; V512-NEXT:    vwmaccu.vx v10, a0, v8
 ; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
-; V512-NEXT:    vid.v v8
-; V512-NEXT:    vsrl.vi v8, v8, 1
+; V512-NEXT:    vsrl.vi v8, v11, 1
 ; V512-NEXT:    vmv.v.i v0, 10
 ; V512-NEXT:    vadd.vi v8, v8, 1
 ; V512-NEXT:    vrgather.vv v10, v9, v8, v0.t
@@ -409,26 +411,27 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
 ; V128-NEXT:    slli a0, a0, 3
 ; V128-NEXT:    sub sp, sp, a0
 ; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; V128-NEXT:    vmv8r.v v0, v16
-; V128-NEXT:    addi a0, sp, 16
-; V128-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; V128-NEXT:    vmv8r.v v24, v16
 ; V128-NEXT:    vmv8r.v v16, v8
+; V128-NEXT:    vmv8r.v v8, v24
+; V128-NEXT:    addi a0, sp, 16
+; V128-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; V128-NEXT:    vslidedown.vi v8, v0, 16
-; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; V128-NEXT:    vwaddu.vv v24, v0, v8
+; V128-NEXT:    vslidedown.vi v0, v24, 16
 ; V128-NEXT:    li a0, -1
-; V128-NEXT:    vwmaccu.vx v24, a0, v8
+; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; V128-NEXT:    vwaddu.vv v24, v8, v0
+; V128-NEXT:    vwmaccu.vx v24, a0, v0
 ; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; V128-NEXT:    vslidedown.vi v0, v16, 16
+; V128-NEXT:    lui a1, 699051
+; V128-NEXT:    li a2, 32
 ; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; V128-NEXT:    vwaddu.vv v8, v0, v16
-; V128-NEXT:    vwmaccu.vx v8, a0, v16
-; V128-NEXT:    lui a1, 699051
 ; V128-NEXT:    addi a1, a1, -1366
 ; V128-NEXT:    vmv.s.x v0, a1
-; V128-NEXT:    li a1, 32
-; V128-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; V128-NEXT:    vwmaccu.vx v8, a0, v16
+; V128-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; V128-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; V128-NEXT:    addi a1, sp, 16
 ; V128-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 1e77b3710928d2..e46587f58b4eb6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -100,8 +100,8 @@ define <4 x i16> @vrgather_shuffle_xv_v4i16(<4 x i16> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    vrsub.vi v10, v9, 4
 ; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vrsub.vi v10, v9, 4
 ; CHECK-NEXT:    vmv.v.i v9, 5
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -116,8 +116,8 @@ define <4 x i16> @vrgather_shuffle_vx_v4i16(<4 x i16> %x) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 3
-; CHECK-NEXT:    vmul.vx v10, v9, a0
 ; CHECK-NEXT:    vmv.v.i v0, 3
+; CHECK-NEXT:    vmul.vx v10, v9, a0
 ; CHECK-NEXT:    vmv.v.i v9, 5
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10, v0.t
 ; CHECK-NEXT:    vmv1r.v v8, v9
@@ -157,38 +157,40 @@ define <8 x i64> @vrgather_permute_shuffle_uv_v8i64(<8 x i64> %x) {
 define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
 ; RV32-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    lui a0, %hi(.LCPI11_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI11_0)
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v16, 2
-; RV32-NEXT:    li a0, 5
-; RV32-NEXT:    lui a1, %hi(.LCPI11_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI11_0)
-; RV32-NEXT:    vle16.v v20, (a1)
-; RV32-NEXT:    vslide1down.vx v21, v16, a0
+; RV32-NEXT:    vle16.v v20, (a0)
+; RV32-NEXT:    vmv.v.i v21, 2
 ; RV32-NEXT:    li a0, 164
+; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v20
 ; RV32-NEXT:    vmv.s.x v0, a0
+; RV32-NEXT:    li a0, 5
+; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; RV32-NEXT:    vslide1down.vx v8, v21, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v20
-; RV32-NEXT:    vrgatherei16.vv v16, v12, v21, v0.t
+; RV32-NEXT:    vrgatherei16.vv v16, v12, v8, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v16
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vrgather_shuffle_vv_v8i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    li a0, 164
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    lui a0, 327683
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    addi a0, a0, 1
 ; RV64-NEXT:    slli a0, a0, 17
 ; RV64-NEXT:    addi a0, a0, 1
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v20, a0
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v20
-; RV64-NEXT:    li a0, 164
-; RV64-NEXT:    vmv.s.x v0, a0
 ; RV64-NEXT:    lui a0, 163841
 ; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    addi a0, a0, 1
 ; RV64-NEXT:    slli a0, a0, 17
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT:    vrgatherei16.vv v16, v8, v20
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
@@ -205,15 +207,15 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) {
 ; RV32-NEXT:    lui a0, %hi(.LCPI12_0)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI12_0)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV32-NEXT:    vle16.v v16, (a0)
-; RV32-NEXT:    vmv.v.i v20, -1
+; RV32-NEXT:    vmv.v.i v16, -1
+; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    lui a0, %hi(.LCPI12_1)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI12_1)
-; RV32-NEXT:    vle16.v v17, (a0)
+; RV32-NEXT:    vle16.v v21, (a0)
 ; RV32-NEXT:    li a0, 113
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vrgatherei16.vv v12, v20, v16
-; RV32-NEXT:    vrgatherei16.vv v12, v8, v17, v0.t
+; RV32-NEXT:    vrgatherei16.vv v12, v16, v20
+; RV32-NEXT:    vrgatherei16.vv v12, v8, v21, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    ret
 ;
@@ -241,14 +243,14 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) {
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI13_0)
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV32-NEXT:    vle16.v v16, (a0)
-; RV32-NEXT:    vrgatherei16.vv v12, v8, v16
 ; RV32-NEXT:    lui a0, %hi(.LCPI13_1)
 ; RV32-NEXT:    addi a0, a0, %lo(.LCPI13_1)
-; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    vle16.v v17, (a0)
 ; RV32-NEXT:    li a0, 140
 ; RV32-NEXT:    vmv.s.x v0, a0
-; RV32-NEXT:    vmv.v.i v16, 5
-; RV32-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
+; RV32-NEXT:    vrgatherei16.vv v12, v8, v16
+; RV32-NEXT:    vmv.v.i v8, 5
+; RV32-NEXT:    vrgatherei16.vv v12, v8, v17, v0.t
 ; RV32-NEXT:    vmv.v.v v8, v12
 ; RV32-NEXT:    ret
 ;
@@ -435,9 +437,9 @@ define <8 x i8> @splat_ve2_we0_ins_i2we4(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 4
 ; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    li a0, 70
 ; CHECK-NEXT:    vsetivli zero, 3, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v11, v10, 2
-; CHECK-NEXT:    li a0, 70
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vi v10, v8, 2
@@ -454,13 +456,13 @@ define <8 x i8> @splat_ve2_we0_ins_i2ve4_i5we6(<8 x i8> %v, <8 x i8> %w) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 6
 ; CHECK-NEXT:    vmv.v.i v11, 0
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v11, v10, 5
 ; CHECK-NEXT:    lui a0, 8256
 ; CHECK-NEXT:    addi a0, a0, 2
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    li a0, 98
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v11, v10, 5
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
@@ -722,17 +724,18 @@ define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) {
 ; CHECK-LABEL: shuffle_v64i8_v8i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    li a1, 240
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    lui a1, 98561
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    vsll.vi v14, v12, 3
 ; CHECK-NEXT:    vrgather.vv v12, v8, v14
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    li a1, 240
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    lui a1, 98561
 ; CHECK-NEXT:    addi a1, a1, -2048
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
 ; CHECK-NEXT:    vrgather.vv v12, v8, v10, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index f894691b993e47..cba8de82ec41b9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -1045,47 +1045,47 @@ define void @urem_v2i64(ptr %x, ptr %y) {
 define void @mulhu_v16i8(ptr %x) {
 ; CHECK-LABEL: mulhu_v16i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    lui a1, 3
-; CHECK-NEXT:    addi a1, a1, -2044
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    lui a1, 1
-; CHECK-NEXT:    addi a2, a1, 32
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a2
 ; CHECK-NEXT:    lui a2, %hi(.LCPI65_0)
 ; CHECK-NEXT:    addi a2, a2, %lo(.LCPI65_0)
 ; CHECK-NEXT:    vle8.v v11, (a2)
-; CHECK-NEXT:    li a2, -128
+; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    addi a1, a1, -2044
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    addi a1, a2, 32
+; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    li a1, -128
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vxm v12, v10, a2, v0
+; CHECK-NEXT:    vmerge.vxm v12, v10, a1, v0
+; CHECK-NEXT:    li a1, 513
+; CHECK-NEXT:    vmv.v.i v13, 4
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vsrl.vv v8, v9, v8
-; CHECK-NEXT:    vmulhu.vv v8, v8, v11
-; CHECK-NEXT:    vsub.vv v9, v9, v8
-; CHECK-NEXT:    vmulhu.vv v9, v9, v12
-; CHECK-NEXT:    vadd.vv v9, v9, v8
-; CHECK-NEXT:    li a2, 513
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a2
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    addi a1, a2, 78
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 4
-; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
-; CHECK-NEXT:    addi a1, a1, 78
+; CHECK-NEXT:    vmerge.vim v10, v13, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsrl.vv v8, v9, v8
+; CHECK-NEXT:    vmulhu.vv v8, v8, v11
+; CHECK-NEXT:    vmerge.vim v10, v10, 3, v0
 ; CHECK-NEXT:    lui a1, 8
 ; CHECK-NEXT:    addi a1, a1, 304
-; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vsub.vv v9, v9, v8
+; CHECK-NEXT:    vmulhu.vv v9, v9, v12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v10, v10, 3, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 2, v0
-; CHECK-NEXT:    vsrl.vv v8, v9, v8
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vmerge.vim v9, v10, 2, v0
+; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <16 x i8>, ptr %x
@@ -1100,31 +1100,31 @@ define void @mulhu_v8i16(ptr %x) {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    lui a1, 1048568
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.s.x v10, a1
+; CHECK-NEXT:    vmv.v.i v10, 1
+; CHECK-NEXT:    li a1, 33
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, %hi(.LCPI66_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI66_0)
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v11, (a1)
-; CHECK-NEXT:    vmv.v.i v12, 1
+; CHECK-NEXT:    vmv.v.i v11, 3
+; CHECK-NEXT:    vle16.v v12, (a1)
+; CHECK-NEXT:    vmerge.vim v11, v11, 2, v0
+; CHECK-NEXT:    vmv.v.i v13, 0
 ; CHECK-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v9, v12, 6
+; CHECK-NEXT:    vslideup.vi v9, v10, 6
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vsrl.vv v9, v8, v9
-; CHECK-NEXT:    vmulhu.vv v9, v9, v11
+; CHECK-NEXT:    vmulhu.vv v9, v9, v12
+; CHECK-NEXT:    lui a1, 1048568
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
+; CHECK-NEXT:    vmv.s.x v13, a1
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vmulhu.vv v8, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v13
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
-; CHECK-NEXT:    li a1, 33
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vmv.v.i v9, 3
-; CHECK-NEXT:    vmerge.vim v9, v9, 2, v0
 ; CHECK-NEXT:    vsetivli zero, 7, e16, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v9, v12, 6
+; CHECK-NEXT:    vslideup.vi v11, v10, 6
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v11
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %a = load <8 x i16>, ptr %x
@@ -1157,22 +1157,22 @@ define void @mulhu_v4i32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    lui a1, %hi(.LCPI68_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI68_0)
-; CHECK-NEXT:    vle32.v v9, (a1)
 ; CHECK-NEXT:    lui a1, 524288
+; CHECK-NEXT:    vmv.v.i v9, 0
 ; CHECK-NEXT:    vmv.s.x v10, a1
-; CHECK-NEXT:    vmv.v.i v11, 0
+; CHECK-NEXT:    lui a1, %hi(.LCPI68_0)
+; CHECK-NEXT:    addi a1, a1, %lo(.LCPI68_0)
+; CHECK-NEXT:    vle32.v v11, (a1)
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, tu, ma
-; CHECK-NEXT:    vslideup.vi v11, v10, 2
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vmulhu.vv v9, v8, v9
-; CHECK-NEXT:    vsub.vv v8, v8, v9
-; CHECK-NEXT:    vmulhu.vv v8, v8, v11
-; CHECK-NEXT:    vadd.vv v8, v8, v9
+; CHECK-NEXT:    vslideup.vi v9, v10, 2
 ; CHECK-NEXT:    lui a1, 4128
 ; CHECK-NEXT:    addi a1, a1, 514
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmulhu.vv v10, v8, v11
+; CHECK-NEXT:    vsub.vv v8, v8, v10
+; CHECK-NEXT:    vmulhu.vv v8, v8, v9
 ; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    vadd.vv v8, v8, v10
 ; CHECK-NEXT:    vsext.vf4 v10, v9
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -1192,10 +1192,10 @@ define void @mulhu_v2i64(ptr %x) {
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI69_0)
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vle32.v v9, (a1)
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vmulhu.vv v8, v8, v9
 ; RV32-NEXT:    lui a1, 32
 ; RV32-NEXT:    addi a1, a1, 1
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vmulhu.vv v8, v8, v9
 ; RV32-NEXT:    vmv.s.x v9, a1
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vsext.vf4 v10, v9
@@ -1209,16 +1209,16 @@ define void @mulhu_v2i64(ptr %x) {
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 838861
+; RV64-NEXT:    lui a2, 699051
 ; RV64-NEXT:    addiw a1, a1, -819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    addiw a2, a2, -1365
+; RV64-NEXT:    slli a3, a1, 32
+; RV64-NEXT:    add a1, a1, a3
+; RV64-NEXT:    slli a3, a2, 32
+; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    vmv.v.x v9, a1
-; RV64-NEXT:    lui a1, 699051
-; RV64-NEXT:    addiw a1, a1, -1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
-; RV64-NEXT:    vmv.s.x v9, a1
+; RV64-NEXT:    vmv.s.x v9, a2
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vmulhu.vv v8, v8, v9
 ; RV64-NEXT:    vid.v v9
@@ -1246,9 +1246,9 @@ define void @mulhs_v16i8(ptr %x) {
 ; CHECK-NEXT:    li a1, 57
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vxm v9, v9, a1, v0
+; CHECK-NEXT:    vmv.v.i v10, 7
 ; CHECK-NEXT:    vmulhu.vv v8, v8, v9
-; CHECK-NEXT:    vmv.v.i v9, 7
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -1263,11 +1263,11 @@ define void @mulhs_v8i16(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    li a1, 105
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 5
 ; CHECK-NEXT:    addi a1, a1, -1755
 ; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    li a1, 105
-; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 1048571
 ; CHECK-NEXT:    addi a1, a1, 1755
 ; CHECK-NEXT:    vmerge.vxm v9, v9, a1, v0
@@ -1309,9 +1309,9 @@ define void @mulhs_v4i32(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    lui a1, 419430
+; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    addi a1, a1, 1639
 ; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vmv.v.i v0, 5
 ; RV32-NEXT:    lui a1, 629146
 ; RV32-NEXT:    addi a1, a1, -1639
 ; RV32-NEXT:    vmerge.vxm v9, v9, a1, v0
@@ -1349,28 +1349,27 @@ define void @mulhs_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a2, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v9, a2
+; RV32-NEXT:    vid.v v9
+; RV32-NEXT:    addi a2, a1, 1365
+; RV32-NEXT:    vmv.v.x v10, a2
+; RV32-NEXT:    li a2, 63
 ; RV32-NEXT:    addi a1, a1, 1366
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
-; RV32-NEXT:    vmv.s.x v9, a1
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vmulh.vv v9, v8, v9
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vid.v v10
-; RV32-NEXT:    vsrl.vi v10, v10, 1
-; RV32-NEXT:    vrsub.vi v10, v10, 0
-; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vmadd.vv v10, v8, v9
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsrl.vx v8, v10, a1
+; RV32-NEXT:    vmv.s.x v10, a1
 ; RV32-NEXT:    lui a1, 16
-; RV32-NEXT:    vmv.s.x v9, a1
+; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32-NEXT:    vsrl.vi v9, v9, 1
+; RV32-NEXT:    vrsub.vi v9, v9, 0
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vmulh.vv v10, v8, v10
+; RV32-NEXT:    vmadd.vv v9, v8, v10
+; RV32-NEXT:    vmv.s.x v8, a1
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vsext.vf4 v11, v9
+; RV32-NEXT:    vsext.vf4 v10, v8
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vsra.vv v9, v10, v11
+; RV32-NEXT:    vsrl.vx v8, v9, a2
+; RV32-NEXT:    vsra.vv v9, v9, v10
 ; RV32-NEXT:    vadd.vv v8, v9, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
@@ -1381,21 +1380,21 @@ define void @mulhs_v2i64(ptr %x) {
 ; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 349525
 ; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    slli a2, a1, 32
+; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    lui a2, %hi(.LCPI74_0)
+; RV64-NEXT:    vid.v v9
 ; RV64-NEXT:    ld a2, %lo(.LCPI74_0)(a2)
-; RV64-NEXT:    slli a3, a1, 32
-; RV64-NEXT:    add a1, a1, a3
-; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    vmv.v.x v10, a1
+; RV64-NEXT:    li a1, 63
+; RV64-NEXT:    vrsub.vi v11, v9, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
-; RV64-NEXT:    vmv.s.x v9, a2
+; RV64-NEXT:    vmv.s.x v10, a2
 ; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64-NEXT:    vmulh.vv v9, v8, v9
-; RV64-NEXT:    vid.v v10
-; RV64-NEXT:    vrsub.vi v11, v10, 0
-; RV64-NEXT:    vmadd.vv v11, v8, v9
-; RV64-NEXT:    li a1, 63
+; RV64-NEXT:    vmulh.vv v10, v8, v10
+; RV64-NEXT:    vmadd.vv v11, v8, v10
 ; RV64-NEXT:    vsrl.vx v8, v11, a1
-; RV64-NEXT:    vsra.vv v9, v11, v10
+; RV64-NEXT:    vsra.vv v9, v11, v9
 ; RV64-NEXT:    vadd.vv v8, v9, v8
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
@@ -3156,47 +3155,47 @@ define void @mulhu_v32i8(ptr %x) {
 ; CHECK-LABEL: mulhu_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    lui a2, 163907
+; CHECK-NEXT:    addi a2, a2, -2044
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a2
+; CHECK-NEXT:    lui a2, 66049
+; CHECK-NEXT:    addi a2, a2, 32
+; CHECK-NEXT:    vmv.s.x v8, a2
+; CHECK-NEXT:    li a2, -128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    lui a1, 163907
-; CHECK-NEXT:    addi a1, a1, -2044
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    lui a1, 66049
-; CHECK-NEXT:    addi a1, a1, 32
-; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vmerge.vxm v10, v12, a2, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI181_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI181_0)
-; CHECK-NEXT:    vle8.v v14, (a1)
-; CHECK-NEXT:    li a1, -128
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vxm v16, v12, a1, v0
+; CHECK-NEXT:    vle8.v v14, (a0)
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
-; CHECK-NEXT:    vsrl.vv v8, v10, v8
-; CHECK-NEXT:    vmulhu.vv v8, v8, v14
-; CHECK-NEXT:    vsub.vv v10, v10, v8
-; CHECK-NEXT:    vmulhu.vv v10, v10, v16
-; CHECK-NEXT:    vadd.vv v10, v10, v8
+; CHECK-NEXT:    vle8.v v12, (a1)
 ; CHECK-NEXT:    lui a1, 8208
 ; CHECK-NEXT:    addi a1, a1, 513
+; CHECK-NEXT:    vsrl.vv v8, v14, v8
+; CHECK-NEXT:    vmulhu.vv v12, v8, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 4
-; CHECK-NEXT:    vmerge.vim v12, v8, 1, v0
 ; CHECK-NEXT:    lui a1, 66785
 ; CHECK-NEXT:    addi a1, a1, 78
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    lui a1, 529160
-; CHECK-NEXT:    addi a1, a1, 304
 ; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    lui a1, 529160
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v12, v12, 3, v0
+; CHECK-NEXT:    vsub.vv v14, v14, v12
+; CHECK-NEXT:    vmulhu.vv v10, v14, v10
+; CHECK-NEXT:    vmv.v.i v14, 4
+; CHECK-NEXT:    addi a1, a1, 304
+; CHECK-NEXT:    vmerge.vim v14, v14, 1, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmv.s.x v9, a1
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v12, 2, v0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v14, v14, 3, v0
+; CHECK-NEXT:    vadd.vv v10, v10, v12
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v14, 2, v0
 ; CHECK-NEXT:    vsrl.vv v8, v10, v8
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -3212,36 +3211,37 @@ define void @mulhu_v16i16(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v10, (a0)
 ; RV32-NEXT:    li a1, 257
-; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    lui a1, 1048568
 ; RV32-NEXT:    vmerge.vxm v12, v8, a1, v0
 ; RV32-NEXT:    lui a1, 4
+; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v14, 0
 ; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV32-NEXT:    vmv.s.x v8, a1
-; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, 0
+; RV32-NEXT:    lui a1, 2
+; RV32-NEXT:    addi a1, a1, 289
+; RV32-NEXT:    vmv.s.x v9, a1
 ; RV32-NEXT:    lui a1, %hi(.LCPI182_0)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI182_0)
-; RV32-NEXT:    vle16.v v14, (a1)
+; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v15, 3
 ; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v9, v9, 1, v0
+; RV32-NEXT:    vmerge.vim v14, v14, 1, v0
+; RV32-NEXT:    vmv1r.v v0, v9
+; RV32-NEXT:    vmerge.vim v9, v15, 2, v0
+; RV32-NEXT:    vle16.v v16, (a1)
+; RV32-NEXT:    vmv1r.v v0, v8
+; RV32-NEXT:    vmerge.vim v8, v9, 1, v0
 ; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT:    vsext.vf2 v16, v9
-; RV32-NEXT:    vsrl.vv v16, v10, v16
-; RV32-NEXT:    vmulhu.vv v14, v16, v14
+; RV32-NEXT:    vsext.vf2 v18, v14
+; RV32-NEXT:    vsrl.vv v14, v10, v18
+; RV32-NEXT:    vmulhu.vv v14, v14, v16
 ; RV32-NEXT:    vsub.vv v10, v10, v14
 ; RV32-NEXT:    vmulhu.vv v10, v10, v12
 ; RV32-NEXT:    vadd.vv v10, v10, v14
-; RV32-NEXT:    lui a1, 2
-; RV32-NEXT:    addi a1, a1, 289
-; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v9, 3
-; RV32-NEXT:    vmerge.vim v9, v9, 2, v0
-; RV32-NEXT:    vmv1r.v v0, v8
-; RV32-NEXT:    vmerge.vim v8, v9, 1, v0
-; RV32-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV32-NEXT:    vsext.vf2 v12, v8
 ; RV32-NEXT:    vsrl.vv v8, v10, v12
 ; RV32-NEXT:    vse16.v v8, (a0)
@@ -3252,31 +3252,31 @@ define void @mulhu_v16i16(ptr %x) {
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle16.v v8, (a0)
 ; RV64-NEXT:    li a1, 257
-; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    vmv.v.i v10, 0
-; RV64-NEXT:    lui a1, 1048568
-; RV64-NEXT:    vmerge.vxm v10, v10, a1, v0
+; RV64-NEXT:    vmv.s.x v0, a1
 ; RV64-NEXT:    lui a1, %hi(.LCPI182_0)
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI182_0)
 ; RV64-NEXT:    vle16.v v12, (a1)
+; RV64-NEXT:    lui a1, 1048568
+; RV64-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; RV64-NEXT:    li a1, 1
 ; RV64-NEXT:    slli a1, a1, 48
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v14, a1
+; RV64-NEXT:    lui a1, %hi(.LCPI182_1)
+; RV64-NEXT:    ld a1, %lo(.LCPI182_1)(a1)
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vsext.vf2 v16, v14
 ; RV64-NEXT:    vsrl.vv v14, v8, v16
 ; RV64-NEXT:    vmulhu.vv v12, v14, v12
-; RV64-NEXT:    lui a1, %hi(.LCPI182_1)
-; RV64-NEXT:    ld a1, %lo(.LCPI182_1)(a1)
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v14, a1
+; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vsub.vv v8, v8, v12
 ; RV64-NEXT:    vmulhu.vv v8, v8, v10
 ; RV64-NEXT:    vadd.vv v8, v8, v12
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vsext.vf2 v12, v10
-; RV64-NEXT:    vsrl.vv v8, v8, v12
+; RV64-NEXT:    vsext.vf2 v10, v14
+; RV64-NEXT:    vsrl.vv v8, v8, v10
 ; RV64-NEXT:    vse16.v v8, (a0)
 ; RV64-NEXT:    ret
   %a = load <16 x i16>, ptr %x
@@ -3291,22 +3291,22 @@ define void @mulhu_v8i32(ptr %x) {
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    li a1, 68
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, %hi(.LCPI183_0)
 ; CHECK-NEXT:    addi a1, a1, %lo(.LCPI183_0)
-; CHECK-NEXT:    vle32.v v10, (a1)
-; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vle32.v v12, (a1)
 ; CHECK-NEXT:    lui a1, 524288
-; CHECK-NEXT:    vmerge.vxm v12, v12, a1, v0
-; CHECK-NEXT:    vmulhu.vv v10, v8, v10
-; CHECK-NEXT:    vsub.vv v8, v8, v10
-; CHECK-NEXT:    vmulhu.vv v8, v8, v12
-; CHECK-NEXT:    vadd.vv v8, v8, v10
+; CHECK-NEXT:    vmerge.vxm v10, v10, a1, v0
 ; CHECK-NEXT:    lui a1, 4128
 ; CHECK-NEXT:    addi a1, a1, 514
+; CHECK-NEXT:    vmulhu.vv v12, v8, v12
+; CHECK-NEXT:    vsub.vv v8, v8, v12
+; CHECK-NEXT:    vmulhu.vv v8, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v12
 ; CHECK-NEXT:    vsext.vf4 v12, v10
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vse32.v v8, (a0)
@@ -3326,24 +3326,22 @@ define void @mulhu_v4i64(ptr %x) {
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI184_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v10, (a1)
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vmulhu.vv v10, v8, v10
 ; RV32-NEXT:    lui a1, 524288
-; RV32-NEXT:    vmv.s.x v12, a1
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.i v14, 0
-; RV32-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
-; RV32-NEXT:    vslideup.vi v14, v12, 5
+; RV32-NEXT:    vmv.v.i v12, 0
+; RV32-NEXT:    vmv.s.x v14, a1
 ; RV32-NEXT:    lui a1, %hi(.LCPI184_1)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI184_1)
+; RV32-NEXT:    vsetivli zero, 6, e32, m2, tu, ma
+; RV32-NEXT:    vslideup.vi v12, v14, 5
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vle8.v v12, (a1)
+; RV32-NEXT:    vle8.v v14, (a1)
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vmulhu.vv v10, v8, v10
 ; RV32-NEXT:    vsub.vv v8, v8, v10
-; RV32-NEXT:    vmulhu.vv v8, v8, v14
+; RV32-NEXT:    vmulhu.vv v8, v8, v12
 ; RV32-NEXT:    vadd.vv v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vsext.vf4 v10, v12
+; RV32-NEXT:    vsext.vf4 v10, v14
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vv v8, v8, v10
 ; RV32-NEXT:    vse64.v v8, (a0)
@@ -3356,22 +3354,22 @@ define void @mulhu_v4i64(ptr %x) {
 ; RV64-NEXT:    lui a1, %hi(.LCPI184_0)
 ; RV64-NEXT:    addi a1, a1, %lo(.LCPI184_0)
 ; RV64-NEXT:    vle64.v v10, (a1)
-; RV64-NEXT:    vmulhu.vv v10, v8, v10
-; RV64-NEXT:    vsub.vv v8, v8, v10
 ; RV64-NEXT:    li a1, -1
+; RV64-NEXT:    vmv.v.i v12, 0
 ; RV64-NEXT:    slli a1, a1, 63
-; RV64-NEXT:    vmv.s.x v12, a1
-; RV64-NEXT:    vmv.v.i v14, 0
+; RV64-NEXT:    vmv.s.x v14, a1
+; RV64-NEXT:    lui a1, 12320
+; RV64-NEXT:    addi a1, a1, 513
 ; RV64-NEXT:    vsetivli zero, 3, e64, m2, tu, ma
-; RV64-NEXT:    vslideup.vi v14, v12, 2
+; RV64-NEXT:    vslideup.vi v12, v14, 2
+; RV64-NEXT:    vmv.s.x v14, a1
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmulhu.vv v8, v8, v14
+; RV64-NEXT:    vmulhu.vv v10, v8, v10
+; RV64-NEXT:    vsub.vv v8, v8, v10
+; RV64-NEXT:    vmulhu.vv v8, v8, v12
 ; RV64-NEXT:    vadd.vv v8, v8, v10
-; RV64-NEXT:    lui a1, 12320
-; RV64-NEXT:    addi a1, a1, 513
-; RV64-NEXT:    vmv.s.x v10, a1
-; RV64-NEXT:    vsext.vf8 v12, v10
-; RV64-NEXT:    vsrl.vv v8, v8, v12
+; RV64-NEXT:    vsext.vf8 v10, v14
+; RV64-NEXT:    vsrl.vv v8, v8, v10
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
   %a = load <4 x i64>, ptr %x
@@ -3384,16 +3382,16 @@ define void @mulhs_v32i8(ptr %x) {
 ; CHECK-LABEL: mulhs_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    lui a2, 304453
+; CHECK-NEXT:    addi a2, a2, -1452
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a2
+; CHECK-NEXT:    li a2, -123
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    lui a1, 304453
-; CHECK-NEXT:    addi a1, a1, -1452
-; CHECK-NEXT:    vmv.s.x v0, a1
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 7
 ; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
-; CHECK-NEXT:    li a1, -123
-; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vmv.v.x v12, a2
 ; CHECK-NEXT:    li a1, 57
 ; CHECK-NEXT:    vmerge.vxm v12, v12, a1, v0
 ; CHECK-NEXT:    vmulhu.vv v8, v8, v12
@@ -3437,11 +3435,11 @@ define void @mulhs_v8i32(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    li a1, 85
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    lui a1, 419430
 ; RV32-NEXT:    addi a1, a1, 1639
 ; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    li a1, 85
-; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    lui a1, 629146
 ; RV32-NEXT:    addi a1, a1, -1639
 ; RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
@@ -3479,63 +3477,61 @@ define void @mulhs_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a2, a1, 1365
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a2
 ; RV32-NEXT:    li a2, 17
 ; RV32-NEXT:    vmv.s.x v0, a2
-; RV32-NEXT:    addi a1, a1, 1366
-; RV32-NEXT:    vmerge.vxm v10, v10, a1, v0
-; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vmulh.vv v10, v8, v10
-; RV32-NEXT:    lui a1, 1048560
+; RV32-NEXT:    lui a2, 1048560
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v12, a1
+; RV32-NEXT:    vmv.v.x v10, a2
+; RV32-NEXT:    addi a2, a1, 1365
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vsext.vf4 v14, v12
+; RV32-NEXT:    vmv.v.x v12, a2
+; RV32-NEXT:    li a2, 63
+; RV32-NEXT:    addi a1, a1, 1366
+; RV32-NEXT:    vmerge.vxm v12, v12, a1, v0
+; RV32-NEXT:    lui a1, 16
+; RV32-NEXT:    vsext.vf4 v14, v10
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vmulh.vv v10, v8, v12
 ; RV32-NEXT:    vmadd.vv v14, v8, v10
-; RV32-NEXT:    li a1, 63
-; RV32-NEXT:    vsrl.vx v8, v14, a1
-; RV32-NEXT:    lui a1, 16
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
+; RV32-NEXT:    vmv.v.x v8, a1
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vsext.vf4 v12, v10
+; RV32-NEXT:    vsext.vf4 v10, v8
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV32-NEXT:    vsra.vv v10, v14, v12
+; RV32-NEXT:    vsrl.vx v8, v14, a2
+; RV32-NEXT:    vsra.vv v10, v14, v10
 ; RV32-NEXT:    vadd.vv v8, v10, v8
 ; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mulhs_v4i64:
 ; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    lui a1, 349525
+; RV64-NEXT:    lui a2, 1044496
 ; RV64-NEXT:    addiw a1, a1, 1365
+; RV64-NEXT:    addi a2, a2, -256
+; RV64-NEXT:    vmv.s.x v10, a2
 ; RV64-NEXT:    slli a2, a1, 32
 ; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    lui a2, %hi(.LCPI188_0)
 ; RV64-NEXT:    ld a2, %lo(.LCPI188_0)(a2)
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vle64.v v8, (a0)
 ; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; RV64-NEXT:    vmv.v.i v0, 5
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vmerge.vxm v10, v10, a2, v0
-; RV64-NEXT:    vmulh.vv v10, v8, v10
-; RV64-NEXT:    lui a1, 1044496
-; RV64-NEXT:    addi a1, a1, -256
-; RV64-NEXT:    vmv.s.x v12, a1
-; RV64-NEXT:    vsext.vf8 v14, v12
-; RV64-NEXT:    vmadd.vv v14, v8, v10
+; RV64-NEXT:    vmv.v.x v12, a1
 ; RV64-NEXT:    li a1, 63
+; RV64-NEXT:    vmerge.vxm v12, v12, a2, v0
+; RV64-NEXT:    lui a2, 4096
+; RV64-NEXT:    addi a2, a2, 256
+; RV64-NEXT:    vsext.vf8 v14, v10
+; RV64-NEXT:    vmulh.vv v10, v8, v12
+; RV64-NEXT:    vmadd.vv v14, v8, v10
+; RV64-NEXT:    vmv.s.x v8, a2
+; RV64-NEXT:    vsext.vf8 v10, v8
 ; RV64-NEXT:    vsrl.vx v8, v14, a1
-; RV64-NEXT:    lui a1, 4096
-; RV64-NEXT:    addi a1, a1, 256
-; RV64-NEXT:    vmv.s.x v10, a1
-; RV64-NEXT:    vsext.vf8 v12, v10
-; RV64-NEXT:    vsra.vv v10, v14, v12
+; RV64-NEXT:    vsra.vv v10, v14, v10
 ; RV64-NEXT:    vadd.vv v8, v10, v8
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    ret
@@ -5632,12 +5628,12 @@ define void @mulhs_vx_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    lui a1, 349525
-; RV32-NEXT:    addi a2, a1, 1365
+; RV32-NEXT:    addi a2, sp, 8
+; RV32-NEXT:    addi a3, a1, 1365
 ; RV32-NEXT:    addi a1, a1, 1366
 ; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw a2, 12(sp)
-; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    vlse64.v v9, (a1), zero
+; RV32-NEXT:    sw a3, 12(sp)
+; RV32-NEXT:    vlse64.v v9, (a2), zero
 ; RV32-NEXT:    vmulh.vv v8, v8, v9
 ; RV32-NEXT:    li a1, 63
 ; RV32-NEXT:    vsrl.vx v9, v8, a1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
index 67c18b5eef736e..123e2243647953 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll
@@ -9,9 +9,9 @@
 define void @vector_interleave_store_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b, ptr %p) {
 ; CHECK-LABEL: vector_interleave_store_v32i1_v16i1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v8, 2
-; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
index af46849ae08719..30e41f2f526e57 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -15,32 +15,37 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    ld a6, 56(a0)
 ; ZVE32X-NEXT:    ld a7, 72(a0)
 ; ZVE32X-NEXT:    ld a0, 80(a0)
+; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT:    vmv.s.x v9, zero
+; ZVE32X-NEXT:    vmv.v.i v10, 0
 ; ZVE32X-NEXT:    xor a3, a3, a4
+; ZVE32X-NEXT:    xor a1, a1, a2
+; ZVE32X-NEXT:    xor a2, a5, a6
+; ZVE32X-NEXT:    xor a0, a7, a0
 ; ZVE32X-NEXT:    snez a3, a3
-; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVE32X-NEXT:    snez a1, a1
+; ZVE32X-NEXT:    snez a2, a2
+; ZVE32X-NEXT:    snez a0, a0
 ; ZVE32X-NEXT:    vmv.s.x v8, a3
+; ZVE32X-NEXT:    vmv.s.x v11, a1
+; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v8, v8, 1
+; ZVE32X-NEXT:    vand.vi v11, v11, 1
 ; ZVE32X-NEXT:    vmsne.vi v0, v8, 0
-; ZVE32X-NEXT:    vmv.s.x v9, zero
-; ZVE32X-NEXT:    vmerge.vim v8, v9, 1, v0
-; ZVE32X-NEXT:    xor a1, a1, a2
-; ZVE32X-NEXT:    snez a1, a1
-; ZVE32X-NEXT:    vmv.s.x v10, a1
-; ZVE32X-NEXT:    vand.vi v10, v10, 1
-; ZVE32X-NEXT:    vmsne.vi v0, v10, 0
+; ZVE32X-NEXT:    vmsne.vi v8, v11, 0
+; ZVE32X-NEXT:    vmerge.vim v11, v9, 1, v0
+; ZVE32X-NEXT:    vmv1r.v v0, v8
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmv.v.i v10, 0
-; ZVE32X-NEXT:    vmerge.vim v11, v10, 1, v0
+; ZVE32X-NEXT:    vmerge.vim v8, v10, 1, v0
 ; ZVE32X-NEXT:    vsetivli zero, 2, e8, mf4, tu, ma
-; ZVE32X-NEXT:    vslideup.vi v11, v8, 1
+; ZVE32X-NEXT:    vslideup.vi v8, v11, 1
+; ZVE32X-NEXT:    vmv.s.x v11, a2
+; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
+; ZVE32X-NEXT:    vand.vi v11, v11, 1
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
-; ZVE32X-NEXT:    xor a1, a5, a6
-; ZVE32X-NEXT:    snez a1, a1
-; ZVE32X-NEXT:    vmv.s.x v8, a1
+; ZVE32X-NEXT:    vmsne.vi v0, v8, 0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vand.vi v8, v8, 1
-; ZVE32X-NEXT:    vmsne.vi v8, v8, 0
+; ZVE32X-NEXT:    vmsne.vi v8, v11, 0
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmerge.vim v11, v10, 1, v0
 ; ZVE32X-NEXT:    vmv1r.v v0, v8
@@ -48,13 +53,12 @@ define <4 x i1> @load_large_vector(ptr %p) {
 ; ZVE32X-NEXT:    vmerge.vim v8, v9, 1, v0
 ; ZVE32X-NEXT:    vsetivli zero, 3, e8, mf4, tu, ma
 ; ZVE32X-NEXT:    vslideup.vi v11, v8, 2
-; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
-; ZVE32X-NEXT:    xor a0, a7, a0
-; ZVE32X-NEXT:    snez a0, a0
 ; ZVE32X-NEXT:    vmv.s.x v8, a0
 ; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vand.vi v8, v8, 1
+; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; ZVE32X-NEXT:    vmsne.vi v0, v11, 0
+; ZVE32X-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmsne.vi v8, v8, 0
 ; ZVE32X-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVE32X-NEXT:    vmerge.vim v10, v10, 1, v0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index b56814ea4c372a..fa1377406d697b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -12,9 +12,9 @@ define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v10, (a0)
+; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vnsrl.wi v8, v10, 0
-; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    vnsrl.wx v9, v10, a0
 ; RV32-NEXT:    ret
 ;
@@ -183,129 +183,107 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 84
+; RV32-NEXT:    li a3, 81
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb
-; RV32-NEXT:    addi a3, a1, 256
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 81 * vlenb
+; RV32-NEXT:    addi a3, a1, 128
+; RV32-NEXT:    addi a4, a1, 256
 ; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    lui a5, 12
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v8, (a3)
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 76
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a3, a1, 128
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vslideup.vi v4, v8, 4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs4r.v v4, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    lui a4, 12
-; RV32-NEXT:    vmv.s.x v0, a4
+; RV32-NEXT:    vle32.v v16, (a4)
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 16
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    li a6, 57
+; RV32-NEXT:    mul a4, a4, a6
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vslideup.vi v4, v8, 10, v0.t
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a4, %hi(.LCPI8_0)
 ; RV32-NEXT:    addi a4, a4, %lo(.LCPI8_0)
+; RV32-NEXT:    vmv.s.x v1, a5
+; RV32-NEXT:    lui a5, %hi(.LCPI8_1)
+; RV32-NEXT:    addi a5, a5, %lo(.LCPI8_1)
+; RV32-NEXT:    vle16.v v4, (a4)
+; RV32-NEXT:    lui a4, 1
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vslideup.vi v12, v16, 4
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 37
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs4r.v v12, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v16, v16, 16
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 45
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 16
+; RV32-NEXT:    vs8r.v v16, (a6) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv1r.v v0, v1
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT:    vslideup.vi v12, v16, 10, v0.t
+; RV32-NEXT:    vmv.v.v v28, v12
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v0, (a4)
-; RV32-NEXT:    lui a4, %hi(.LCPI8_1)
-; RV32-NEXT:    addi a4, a4, %lo(.LCPI8_1)
-; RV32-NEXT:    lui a5, 1
-; RV32-NEXT:    vle16.v v8, (a4)
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a6, 56
-; RV32-NEXT:    mul a4, a4, a6
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs4r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vle16.v v24, (a5)
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a4, 68
-; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    slli a5, a1, 6
+; RV32-NEXT:    add a1, a5, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v24, (a3)
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 60
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a1, a5, -64
-; RV32-NEXT:    vmv.s.x v16, a1
+; RV32-NEXT:    vle32.v v16, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v16, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vrgatherei16.vv v16, v8, v0
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    addi a1, a4, -64
+; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v4
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v16, v24, v8, v0.t
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v4, v16
+; RV32-NEXT:    vmv.v.v v28, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 36
+; RV32-NEXT:    li a3, 41
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v4, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vslideup.vi v12, v8, 2
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v1, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslideup.vi v12, v16, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -318,7 +296,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    vle16.v v8, (a3)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -328,34 +306,34 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV32-NEXT:    vle16.v v2, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 68
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 6
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v24, v16, v12
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 60
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v24, v8, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -363,13 +341,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v8, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -378,15 +356,15 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vrgatherei16.vv v8, v24, v2
 ; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslideup.vi v8, v24, 6, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
@@ -398,22 +376,18 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vle16.v v24, (a1)
 ; RV32-NEXT:    vle16.v v4, (a3)
 ; RV32-NEXT:    li a1, 960
-; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.s.x v28, a1
 ; RV32-NEXT:    vrgatherei16.vv v8, v16, v24
+; RV32-NEXT:    vmv1r.v v0, v28
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 60
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -423,70 +397,78 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a3, %hi(.LCPI8_8)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI8_8)
 ; RV32-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV32-NEXT:    vle16.v v16, (a1)
+; RV32-NEXT:    vle16.v v8, (a1)
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_9)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_9)
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v8, (a3)
+; RV32-NEXT:    vle16.v v12, (a3)
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 2
+; RV32-NEXT:    li a4, 13
+; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs4r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vle16.v v8, (a1)
+; RV32-NEXT:    vs4r.v v12, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vle16.v v12, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v12, v8, v16
+; RV32-NEXT:    vrgatherei16.vv v20, v16, v8
 ; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 48
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v12, v16, 4, v0.t
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv4r.v v24, v8
+; RV32-NEXT:    vslideup.vi v20, v8, 4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    li a3, 21
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 68
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 6
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    li a3, 13
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v8, v0, v20
+; RV32-NEXT:    vrgatherei16.vv v8, v0, v16
+; RV32-NEXT:    vmv1r.v v0, v28
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a3, 73
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
+; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a3, 13
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -497,21 +479,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a1, 15
 ; RV32-NEXT:    vmv.s.x v3, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v8, v24, 6
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vslideup.vi v8, v16, 6
 ; RV32-NEXT:    vmv1r.v v0, v3
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v12, v0.t
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v12, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 76
+; RV32-NEXT:    li a3, 57
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv4r.v v24, v16
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_11)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_11)
 ; RV32-NEXT:    lui a3, %hi(.LCPI8_12)
@@ -527,21 +508,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 68
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 6
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v16, v28
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 60
+; RV32-NEXT:    li a3, 73
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a3, a1, 2
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -560,7 +542,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv1r.v v0, v3
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 40
+; RV32-NEXT:    li a3, 37
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -568,13 +550,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
 ; RV32-NEXT:    vrgatherei16.vv v16, v24, v8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 44
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 5
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 28
+; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -582,8 +564,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v20, v8
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 68
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a3, a1, 6
+; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -595,7 +577,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 60
+; RV32-NEXT:    li a2, 73
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -604,26 +586,28 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    li a2, 21
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v28, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    li a2, 13
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v28, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 76
+; RV32-NEXT:    li a2, 57
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a2, a1, 2
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -640,21 +624,21 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vse32.v v20, (a1)
 ; RV32-NEXT:    addi a1, a0, 64
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 56
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 16
 ; RV32-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 36
+; RV32-NEXT:    li a2, 41
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 84
+; RV32-NEXT:    li a1, 81
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
@@ -667,141 +651,130 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 6
-; RV64-NEXT:    add a2, a3, a2
+; RV64-NEXT:    slli a2, a2, 6
 ; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 65 * vlenb
-; RV64-NEXT:    addi a2, a1, 256
-; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v16, (a2)
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 21
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
 ; RV64-NEXT:    addi a2, a1, 128
+; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a1)
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 48
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a1, a1, 256
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 57
+; RV64-NEXT:    li a3, 20
 ; RV64-NEXT:    mul a1, a1, a3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    li a1, 128
+; RV64-NEXT:    vid.v v10
+; RV64-NEXT:    vmv.s.x v1, a1
+; RV64-NEXT:    li a1, 6
+; RV64-NEXT:    vmul.vx v2, v10, a1
+; RV64-NEXT:    li a1, 56
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vrgather.vi v12, v16, 4
-; RV64-NEXT:    li a1, 128
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v16, v16, 8
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a3, 37
-; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    li a4, 36
+; RV64-NEXT:    mul a3, a3, a4
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgather.vi v12, v16, 2, v0.t
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vid.v v10
-; RV64-NEXT:    li a1, 6
-; RV64-NEXT:    vmul.vx v8, v10, a1
-; RV64-NEXT:    li a1, 56
-; RV64-NEXT:    vle64.v v24, (a2)
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vle64.v v16, (a2)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 45
+; RV64-NEXT:    li a3, 56
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv.s.x v10, a1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs1r.v v10, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vadd.vi v10, v8, -16
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.s.x v7, a1
+; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64-NEXT:    vadd.vi v10, v2, -16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v16, v0, v8
-; RV64-NEXT:    vmv2r.v v4, v8
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v2
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl1r.v v6, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv1r.v v0, v6
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v10, v0.t
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v10, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v16
+; RV64-NEXT:    vmv.v.v v12, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 4
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
+; RV64-NEXT:    li a2, 20
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v12, v8, 5
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl1r.v v1, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v12, v16, 5
 ; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
+; RV64-NEXT:    li a2, 36
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vrgather.vi v12, v16, 3, v0.t
-; RV64-NEXT:    vmv.v.v v28, v12
+; RV64-NEXT:    vmv.v.v v20, v12
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v24, v4, 1
-; RV64-NEXT:    vadd.vi v26, v4, -15
+; RV64-NEXT:    vadd.vi v16, v2, 1
+; RV64-NEXT:    vadd.vi v18, v2, -15
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v24
-; RV64-NEXT:    vmv1r.v v0, v6
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v16
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v16, v8, v26, v0.t
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v18, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v28, v16
+; RV64-NEXT:    vmv.v.v v20, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 13
+; RV64-NEXT:    li a2, 12
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a1, 16
 ; RV64-NEXT:    addi a1, a1, 7
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.i v9, 6
 ; RV64-NEXT:    vmv.v.x v10, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
+; RV64-NEXT:    li a2, 20
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
@@ -809,72 +782,66 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vrgatherei16.vv v12, v16, v9
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vrgatherei16.vv v12, v16, v10
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv4r.v v8, v16
 ; RV64-NEXT:    vrgather.vi v12, v16, 2
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vrgather.vi v12, v16, 3
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    li a2, 28
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a1, 24
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.s.x v7, a1
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v16, v4, 2
-; RV64-NEXT:    vadd.vi v2, v4, -14
+; RV64-NEXT:    vadd.vi v10, v2, 2
+; RV64-NEXT:    vadd.vi v4, v2, -14
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v16
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v10
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v2, v0.t
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v24, v8, v4, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 20
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
+; RV64-NEXT:    li a2, 36
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
@@ -882,194 +849,168 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgather.vi v28, v24, 4, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v28, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv2r.v v8, v4
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v4, v4, 3
-; RV64-NEXT:    vadd.vi v6, v8, -13
-; RV64-NEXT:    vmv2r.v v2, v8
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vi v30, v2, 3
+; RV64-NEXT:    vadd.vi v28, v2, -13
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v4
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v6, v0.t
+; RV64-NEXT:    vrgatherei16.vv v8, v16, v30
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
+; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vmv4r.v v16, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 37
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v4, v16, 5, v0.t
+; RV64-NEXT:    vrgather.vi v4, v24, 5, v0.t
 ; RV64-NEXT:    lui a1, 96
 ; RV64-NEXT:    li a2, 192
-; RV64-NEXT:    vmv.s.x v1, a2
+; RV64-NEXT:    vmv.s.x v8, a2
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a1
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    vmv1r.v v0, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
+; RV64-NEXT:    vrgatherei16.vv v12, v24, v9, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a1, 28
-; RV64-NEXT:    vmv.s.x v0, a1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.s.x v1, a1
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v22, v2, 4
-; RV64-NEXT:    vadd.vi v20, v2, -12
+; RV64-NEXT:    vadd.vi v10, v2, 4
+; RV64-NEXT:    vadd.vi v12, v2, -12
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v22
+; RV64-NEXT:    vrgatherei16.vv v16, v24, v10
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
+; RV64-NEXT:    vrgatherei16.vv v16, v24, v12, v0.t
 ; RV64-NEXT:    lui a1, 112
 ; RV64-NEXT:    addi a1, a1, 1
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v12, a1
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    vmv1r.v v0, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    li a2, 28
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 36
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v20, v16, v12, v0.t
+; RV64-NEXT:    vrgatherei16.vv v12, v24, v9, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    li a2, 28
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a2, 20
+; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v24
+; RV64-NEXT:    vmv.v.v v20, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 53
+; RV64-NEXT:    li a2, 44
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs4r.v v12, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v12, v2, 5
+; RV64-NEXT:    vadd.vi v20, v2, 5
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 57
+; RV64-NEXT:    li a2, 48
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v24, v16, v12
+; RV64-NEXT:    vrgatherei16.vv v8, v24, v20
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v12, v2, -11
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    add a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vadd.vi v20, v2, -11
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 45
+; RV64-NEXT:    li a2, 56
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
+; RV64-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
 ; RV64-NEXT:    vmv4r.v v12, v4
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 21
-; RV64-NEXT:    mul a1, a1, a2
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
+; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
 ; RV64-NEXT:    vmv.v.v v12, v0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 5
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v20, v8
+; RV64-NEXT:    vmv.v.v v20, v16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 29
+; RV64-NEXT:    li a2, 28
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v8, v24
+; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v16, v8
 ; RV64-NEXT:    addi a1, a0, 320
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vse64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
 ; RV64-NEXT:    vse64.v v20, (a1)
 ; RV64-NEXT:    addi a1, a0, 192
 ; RV64-NEXT:    vse64.v v12, (a1)
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 53
+; RV64-NEXT:    li a3, 44
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
@@ -1077,22 +1018,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    addi a1, a0, 64
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 13
+; RV64-NEXT:    li a3, 12
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 4
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a1, a1, 4
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 6
-; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    slli a0, a0, 6
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
index eb5da36116af37..f27614c93985f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llrint.ll
@@ -88,13 +88,13 @@ define <2 x i64> @llrint_v2i64_v2f32(<2 x float> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a1
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    ret
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f32(<2 x float> %x)
   ret <2 x i64> %a
@@ -187,25 +187,23 @@ define <3 x i64> @llrint_v3i64_v3f32(<3 x float> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vfmv.f.s fa5, v9
+; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %a = call <3 x i64> @llvm.llrint.v3i64.v3f32(<3 x float> %x)
   ret <3 x i64> %a
@@ -298,25 +296,23 @@ define <4 x i64> @llrint_v4i64_v4f32(<4 x float> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vfmv.f.s fa5, v9
+; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    fcvt.l.s a0, fa5
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f32(<4 x float> %x)
   ret <4 x i64> %a
@@ -427,37 +423,37 @@ define <8 x i64> @llrint_v8i64_v8f32(<8 x float> %x) {
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vslidedown.vi v10, v8, 7
+; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v9, v8, 3
+; RV64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-NEXT:    fcvt.l.s a4, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
+; RV64-NEXT:    fcvt.l.s a5, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    fcvt.l.s a6, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-NEXT:    sd a4, 32(sp)
 ; RV64-NEXT:    sd a3, 40(sp)
 ; RV64-NEXT:    sd a2, 48(sp)
 ; RV64-NEXT:    sd a1, 56(sp)
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 3
-; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.s a1, fa5
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.s a2, fa5
-; RV64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-NEXT:    sd a0, 0(sp)
-; RV64-NEXT:    sd a3, 8(sp)
-; RV64-NEXT:    sd a2, 16(sp)
-; RV64-NEXT:    sd a1, 24(sp)
+; RV64-NEXT:    sd a1, 8(sp)
+; RV64-NEXT:    sd a6, 16(sp)
+; RV64-NEXT:    sd a5, 24(sp)
 ; RV64-NEXT:    mv a0, sp
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
@@ -619,62 +615,62 @@ define <16 x i64> @llrint_v16i64_v16f32(<16 x float> %x) {
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-NEXT:    vse32.v v8, (a0)
 ; RV64-NEXT:    flw fa5, 124(sp)
+; RV64-NEXT:    vfmv.f.s fa4, v8
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-NEXT:    vslidedown.vi v11, v8, 2
 ; RV64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-NEXT:    sd a0, 248(sp)
 ; RV64-NEXT:    flw fa5, 120(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 240(sp)
+; RV64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-NEXT:    fcvt.l.s a0, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v10
+; RV64-NEXT:    fcvt.l.s a1, fa5
+; RV64-NEXT:    sd a1, 240(sp)
 ; RV64-NEXT:    flw fa5, 116(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 232(sp)
+; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-NEXT:    vslidedown.vi v14, v8, 7
+; RV64-NEXT:    fcvt.l.s a1, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v11
+; RV64-NEXT:    fcvt.l.s a2, fa5
+; RV64-NEXT:    sd a2, 232(sp)
 ; RV64-NEXT:    flw fa5, 112(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 224(sp)
+; RV64-NEXT:    fcvt.l.s a2, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v12
+; RV64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-NEXT:    fcvt.l.s a3, fa5
+; RV64-NEXT:    sd a3, 224(sp)
 ; RV64-NEXT:    flw fa5, 108(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 216(sp)
+; RV64-NEXT:    fcvt.l.s a3, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v14
+; RV64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-NEXT:    fcvt.l.s a4, fa5
+; RV64-NEXT:    sd a4, 216(sp)
 ; RV64-NEXT:    flw fa5, 104(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 208(sp)
+; RV64-NEXT:    fcvt.l.s a4, fa4
+; RV64-NEXT:    vfmv.f.s fa4, v10
+; RV64-NEXT:    fcvt.l.s a5, fa4
+; RV64-NEXT:    fcvt.l.s a6, fa5
+; RV64-NEXT:    sd a6, 208(sp)
 ; RV64-NEXT:    flw fa5, 100(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 200(sp)
-; RV64-NEXT:    flw fa5, 96(sp)
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    sd a0, 192(sp)
+; RV64-NEXT:    vfmv.f.s fa4, v12
+; RV64-NEXT:    fcvt.l.s a6, fa4
+; RV64-NEXT:    vslidedown.vi v8, v8, 4
+; RV64-NEXT:    fcvt.l.s a7, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a1, fa5
-; RV64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a2, fa5
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a3, fa5
+; RV64-NEXT:    sd a7, 200(sp)
+; RV64-NEXT:    fcvt.l.s a7, fa5
+; RV64-NEXT:    flw fa5, 96(sp)
 ; RV64-NEXT:    sd a0, 128(sp)
 ; RV64-NEXT:    sd a3, 136(sp)
 ; RV64-NEXT:    sd a2, 144(sp)
 ; RV64-NEXT:    sd a1, 152(sp)
-; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    sd a7, 160(sp)
+; RV64-NEXT:    sd a6, 168(sp)
+; RV64-NEXT:    sd a5, 176(sp)
+; RV64-NEXT:    sd a4, 184(sp)
 ; RV64-NEXT:    fcvt.l.s a0, fa5
-; RV64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a1, fa5
-; RV64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.s a2, fa5
-; RV64-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.s a3, fa5
-; RV64-NEXT:    sd a3, 160(sp)
-; RV64-NEXT:    sd a2, 168(sp)
-; RV64-NEXT:    sd a1, 176(sp)
-; RV64-NEXT:    sd a0, 184(sp)
+; RV64-NEXT:    sd a0, 192(sp)
 ; RV64-NEXT:    addi a0, sp, 128
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
@@ -775,12 +771,12 @@ define <2 x i64> @llrint_v2i64_v2f64(<2 x double> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v9
-; RV64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-NEXT:    fcvt.l.d a1, fa5
-; RV64-NEXT:    vmv.v.x v8, a1
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    vmv.v.x v8, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    ret
   %a = call <2 x i64> @llvm.llrint.v2i64.v2f64(<2 x double> %x)
   ret <2 x i64> %a
@@ -871,21 +867,22 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v10
-; RV64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.d a1, fa5
-; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
+; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    fcvt.l.d a1, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v12
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-NEXT:    ret
   %a = call <4 x i64> @llvm.llrint.v4i64.v4f64(<4 x double> %x)
   ret <4 x i64> %a
@@ -987,34 +984,34 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    fld fa5, 56(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    sd a0, 120(sp)
-; RV64-NEXT:    fld fa5, 48(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    sd a0, 112(sp)
-; RV64-NEXT:    fld fa5, 40(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    sd a0, 104(sp)
-; RV64-NEXT:    fld fa5, 32(sp)
-; RV64-NEXT:    fcvt.l.d a0, fa5
-; RV64-NEXT:    sd a0, 96(sp)
-; RV64-NEXT:    vfmv.f.s fa5, v8
-; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    vfmv.f.s fa4, v8
 ; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    fcvt.l.d a0, fa4
 ; RV64-NEXT:    fcvt.l.d a1, fa5
+; RV64-NEXT:    sd a1, 120(sp)
+; RV64-NEXT:    fld fa5, 48(sp)
+; RV64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-NEXT:    vfmv.f.s fa5, v10
+; RV64-NEXT:    fcvt.l.d a1, fa4
 ; RV64-NEXT:    fcvt.l.d a2, fa5
+; RV64-NEXT:    sd a2, 112(sp)
+; RV64-NEXT:    fld fa5, 40(sp)
+; RV64-NEXT:    vfmv.f.s fa4, v10
+; RV64-NEXT:    fcvt.l.d a2, fa4
 ; RV64-NEXT:    vslidedown.vi v8, v8, 2
+; RV64-NEXT:    fcvt.l.d a3, fa5
 ; RV64-NEXT:    vfmv.f.s fa5, v8
+; RV64-NEXT:    sd a3, 104(sp)
 ; RV64-NEXT:    fcvt.l.d a3, fa5
+; RV64-NEXT:    fld fa5, 32(sp)
 ; RV64-NEXT:    sd a0, 64(sp)
 ; RV64-NEXT:    sd a1, 72(sp)
 ; RV64-NEXT:    sd a3, 80(sp)
 ; RV64-NEXT:    sd a2, 88(sp)
+; RV64-NEXT:    fcvt.l.d a0, fa5
+; RV64-NEXT:    sd a0, 96(sp)
 ; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
index 8f1e026d09c0a2..356bc5edd77a19 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load.ll
@@ -88,14 +88,14 @@ define <6 x i1> @load_v6i1(ptr %p) {
 ; RV32-NEXT:    lbu a0, 0(a0)
 ; RV32-NEXT:    srli a1, a0, 5
 ; RV32-NEXT:    slli a2, a0, 27
-; RV32-NEXT:    srli a2, a2, 31
 ; RV32-NEXT:    slli a3, a0, 28
-; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    slli a4, a0, 29
-; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    slli a5, a0, 30
-; RV32-NEXT:    srli a5, a5, 31
 ; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    srli a2, a2, 31
+; RV32-NEXT:    srli a3, a3, 31
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    srli a5, a5, 31
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
 ; RV32-NEXT:    vslide1down.vx v8, v8, a5
@@ -113,14 +113,14 @@ define <6 x i1> @load_v6i1(ptr %p) {
 ; RV64-NEXT:    lbu a0, 0(a0)
 ; RV64-NEXT:    srli a1, a0, 5
 ; RV64-NEXT:    slli a2, a0, 59
-; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    slli a3, a0, 60
-; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    slli a4, a0, 61
-; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    slli a5, a0, 62
-; RV64-NEXT:    srli a5, a5, 63
 ; RV64-NEXT:    andi a0, a0, 1
+; RV64-NEXT:    srli a2, a2, 63
+; RV64-NEXT:    srli a3, a3, 63
+; RV64-NEXT:    srli a4, a4, 63
+; RV64-NEXT:    srli a5, a5, 63
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    vslide1down.vx v8, v8, a5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
index 0e98fd1ab0f5dd..2f58e3dd2769f1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-lrint.ll
@@ -41,37 +41,37 @@ define <2 x iXLen> @lrint_v2f32(<2 x float> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v9
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v9
 ; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v2f32:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v9
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vmv.v.x v8, a1
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    vmv.v.x v8, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v2f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v8, a1
-; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i64-NEXT:    vmv.v.x v8, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i64-NEXT:    ret
   %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f32(<2 x float> %x)
   ret <2 x iXLen> %a
@@ -83,65 +83,63 @@ define <3 x iXLen> @lrint_v3f32(<3 x float> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v9
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v9
+; RV32-NEXT:    fcvt.w.s a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v10
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    vslide1down.vx v8, v9, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v3f32:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v9
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vmv.v.x v9, a1
-; RV64-i32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64-i32-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v9
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v10
+; RV64-i32-NEXT:    vmv.v.x v9, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    vslide1down.vx v8, v9, a1
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v3f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v10, a1
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
+; RV64-i64-NEXT:    vmv.v.x v10, a0
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    ret
   %a = call <3 x iXLen> @llvm.lrint.v3iXLen.v3f32(<3 x float> %x)
   ret <3 x iXLen> %a
@@ -153,65 +151,63 @@ define <4 x iXLen> @lrint_v4f32(<4 x float> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v9
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vmv.v.x v9, a1
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v9
+; RV32-NEXT:    fcvt.w.s a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v10
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v9, v9, a0
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    vslide1down.vx v8, v9, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v4f32:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v9
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vmv.v.x v9, a1
-; RV64-i32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64-i32-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v9
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v10
+; RV64-i32-NEXT:    vmv.v.x v9, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v9, v9, a0
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    vslide1down.vx v8, v9, a1
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v8, v9, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v4f32:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v10, a1
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
+; RV64-i64-NEXT:    vmv.v.x v10, a0
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    ret
   %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f32(<4 x float> %x)
   ret <4 x iXLen> %a
@@ -223,82 +219,74 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    vslidedown.vi v11, v8, 2
+; RV32-NEXT:    vslidedown.vi v12, v8, 3
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v10
 ; RV32-NEXT:    fcvt.w.s a1, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v11
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 2
-; RV32-NEXT:    vfmv.f.s fa5, v12
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v12, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa5, v12
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    fcvt.w.s a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v12
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    vslidedown.vi v12, v8, 5
-; RV32-NEXT:    vfmv.f.s fa5, v12
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
+; RV32-NEXT:    fcvt.w.s a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 6
+; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    vslide1down.vx v10, v10, a1
+; RV32-NEXT:    fcvt.w.s a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v12
-; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vslidedown.vi v8, v8, 7
+; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    vslide1down.vx v8, v10, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v8f32:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    vslidedown.vi v11, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v12, v8, 3
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v10
 ; RV64-i32-NEXT:    fcvt.l.s a1, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v11
 ; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-i32-NEXT:    vmv.v.x v10, a1
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v12, v8, 2
-; RV64-i32-NEXT:    vfmv.f.s fa5, v12
+; RV64-i32-NEXT:    vmv.v.x v10, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v12, v8, 3
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v12
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i32-NEXT:    vslidedown.vi v12, v8, 4
+; RV64-i32-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v12
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i32-NEXT:    vslidedown.vi v12, v8, 5
-; RV64-i32-NEXT:    vfmv.f.s fa5, v12
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v12
 ; RV64-i32-NEXT:    vslidedown.vi v12, v8, 6
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 7
+; RV64-i32-NEXT:    vslide1down.vx v10, v10, a1
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v12
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 7
+; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v8f32:
@@ -314,37 +302,37 @@ define <8 x iXLen> @lrint_v8f32(<8 x float> %x) {
 ; RV64-i64-NEXT:    andi sp, sp, -64
 ; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 7
+; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-i64-NEXT:    fcvt.l.s a1, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-i64-NEXT:    fcvt.l.s a2, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 4
+; RV64-i64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-i64-NEXT:    vslidedown.vi v9, v8, 3
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 1
 ; RV64-i64-NEXT:    fcvt.l.s a4, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
+; RV64-i64-NEXT:    fcvt.l.s a5, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    fcvt.l.s a6, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v8
 ; RV64-i64-NEXT:    sd a4, 32(sp)
 ; RV64-i64-NEXT:    sd a3, 40(sp)
 ; RV64-i64-NEXT:    sd a2, 48(sp)
 ; RV64-i64-NEXT:    sd a1, 56(sp)
-; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v9, v8, 3
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.s a1, fa5
-; RV64-i64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.s a2, fa5
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a3, fa5
 ; RV64-i64-NEXT:    sd a0, 0(sp)
-; RV64-i64-NEXT:    sd a3, 8(sp)
-; RV64-i64-NEXT:    sd a2, 16(sp)
-; RV64-i64-NEXT:    sd a1, 24(sp)
+; RV64-i64-NEXT:    sd a1, 8(sp)
+; RV64-i64-NEXT:    sd a6, 16(sp)
+; RV64-i64-NEXT:    sd a5, 24(sp)
 ; RV64-i64-NEXT:    mv a0, sp
 ; RV64-i64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-i64-NEXT:    vle64.v v8, (a0)
@@ -378,62 +366,62 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    flw fa5, 60(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v8
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 3
+; RV32-NEXT:    vslidedown.vi v11, v8, 2
 ; RV32-NEXT:    fcvt.w.s a0, fa5
 ; RV32-NEXT:    sw a0, 124(sp)
 ; RV32-NEXT:    flw fa5, 56(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 120(sp)
+; RV32-NEXT:    fcvt.w.s a0, fa4
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 1
+; RV32-NEXT:    fcvt.w.s a1, fa5
+; RV32-NEXT:    sw a1, 120(sp)
 ; RV32-NEXT:    flw fa5, 52(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 116(sp)
+; RV32-NEXT:    fcvt.w.s a1, fa4
+; RV32-NEXT:    vfmv.f.s fa4, v11
+; RV32-NEXT:    fcvt.w.s a2, fa4
+; RV32-NEXT:    fcvt.w.s a3, fa5
+; RV32-NEXT:    sw a3, 116(sp)
 ; RV32-NEXT:    flw fa5, 48(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 112(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 7
+; RV32-NEXT:    fcvt.w.s a3, fa4
+; RV32-NEXT:    fcvt.w.s a4, fa5
+; RV32-NEXT:    sw a4, 112(sp)
 ; RV32-NEXT:    flw fa5, 44(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 108(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 6
+; RV32-NEXT:    fcvt.w.s a4, fa4
+; RV32-NEXT:    fcvt.w.s a5, fa5
+; RV32-NEXT:    sw a5, 108(sp)
 ; RV32-NEXT:    flw fa5, 40(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 104(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    vslidedown.vi v10, v8, 5
+; RV32-NEXT:    fcvt.w.s a5, fa4
+; RV32-NEXT:    fcvt.w.s a6, fa5
+; RV32-NEXT:    sw a6, 104(sp)
 ; RV32-NEXT:    flw fa5, 36(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 100(sp)
-; RV32-NEXT:    flw fa5, 32(sp)
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    sw a0, 96(sp)
+; RV32-NEXT:    vfmv.f.s fa4, v10
+; RV32-NEXT:    fcvt.w.s a6, fa4
+; RV32-NEXT:    vslidedown.vi v8, v8, 4
+; RV32-NEXT:    fcvt.w.s a7, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a2, fa5
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a3, fa5
+; RV32-NEXT:    sw a7, 100(sp)
+; RV32-NEXT:    fcvt.w.s a7, fa5
+; RV32-NEXT:    flw fa5, 32(sp)
 ; RV32-NEXT:    sw a0, 64(sp)
 ; RV32-NEXT:    sw a3, 68(sp)
 ; RV32-NEXT:    sw a2, 72(sp)
 ; RV32-NEXT:    sw a1, 76(sp)
-; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 7
-; RV32-NEXT:    vfmv.f.s fa5, v10
+; RV32-NEXT:    sw a7, 80(sp)
+; RV32-NEXT:    sw a6, 84(sp)
+; RV32-NEXT:    sw a5, 88(sp)
+; RV32-NEXT:    sw a4, 92(sp)
 ; RV32-NEXT:    fcvt.w.s a0, fa5
-; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a1, fa5
-; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.s a2, fa5
-; RV32-NEXT:    vslidedown.vi v8, v8, 4
-; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.s a3, fa5
-; RV32-NEXT:    sw a3, 80(sp)
-; RV32-NEXT:    sw a2, 84(sp)
-; RV32-NEXT:    sw a1, 88(sp)
-; RV32-NEXT:    sw a0, 92(sp)
+; RV32-NEXT:    sw a0, 96(sp)
 ; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
@@ -462,62 +450,62 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV64-i32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-i32-NEXT:    vse32.v v8, (a0)
 ; RV64-i32-NEXT:    flw fa5, 60(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v8
+; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-i32-NEXT:    vslidedown.vi v11, v8, 2
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i32-NEXT:    sw a0, 124(sp)
 ; RV64-i32-NEXT:    flw fa5, 56(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 120(sp)
+; RV64-i32-NEXT:    fcvt.l.s a0, fa4
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
+; RV64-i32-NEXT:    fcvt.l.s a1, fa5
+; RV64-i32-NEXT:    sw a1, 120(sp)
 ; RV64-i32-NEXT:    flw fa5, 52(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 116(sp)
+; RV64-i32-NEXT:    fcvt.l.s a1, fa4
+; RV64-i32-NEXT:    vfmv.f.s fa4, v11
+; RV64-i32-NEXT:    fcvt.l.s a2, fa4
+; RV64-i32-NEXT:    fcvt.l.s a3, fa5
+; RV64-i32-NEXT:    sw a3, 116(sp)
 ; RV64-i32-NEXT:    flw fa5, 48(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 112(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 7
+; RV64-i32-NEXT:    fcvt.l.s a3, fa4
+; RV64-i32-NEXT:    fcvt.l.s a4, fa5
+; RV64-i32-NEXT:    sw a4, 112(sp)
 ; RV64-i32-NEXT:    flw fa5, 44(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 108(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-i32-NEXT:    fcvt.l.s a4, fa4
+; RV64-i32-NEXT:    fcvt.l.s a5, fa5
+; RV64-i32-NEXT:    sw a5, 108(sp)
 ; RV64-i32-NEXT:    flw fa5, 40(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 104(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    vslidedown.vi v10, v8, 5
+; RV64-i32-NEXT:    fcvt.l.s a5, fa4
+; RV64-i32-NEXT:    fcvt.l.s a6, fa5
+; RV64-i32-NEXT:    sw a6, 104(sp)
 ; RV64-i32-NEXT:    flw fa5, 36(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 100(sp)
-; RV64-i32-NEXT:    flw fa5, 32(sp)
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    sw a0, 96(sp)
+; RV64-i32-NEXT:    vfmv.f.s fa4, v10
+; RV64-i32-NEXT:    fcvt.l.s a6, fa4
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 4
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a2, fa5
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a3, fa5
+; RV64-i32-NEXT:    sw a7, 100(sp)
+; RV64-i32-NEXT:    fcvt.l.s a7, fa5
+; RV64-i32-NEXT:    flw fa5, 32(sp)
 ; RV64-i32-NEXT:    sw a0, 64(sp)
 ; RV64-i32-NEXT:    sw a3, 68(sp)
 ; RV64-i32-NEXT:    sw a2, 72(sp)
 ; RV64-i32-NEXT:    sw a1, 76(sp)
-; RV64-i32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
+; RV64-i32-NEXT:    sw a7, 80(sp)
+; RV64-i32-NEXT:    sw a6, 84(sp)
+; RV64-i32-NEXT:    sw a5, 88(sp)
+; RV64-i32-NEXT:    sw a4, 92(sp)
 ; RV64-i32-NEXT:    fcvt.l.s a0, fa5
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a1, fa5
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.s a2, fa5
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.s a3, fa5
-; RV64-i32-NEXT:    sw a3, 80(sp)
-; RV64-i32-NEXT:    sw a2, 84(sp)
-; RV64-i32-NEXT:    sw a1, 88(sp)
-; RV64-i32-NEXT:    sw a0, 92(sp)
+; RV64-i32-NEXT:    sw a0, 96(sp)
 ; RV64-i32-NEXT:    addi a0, sp, 64
 ; RV64-i32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-i32-NEXT:    vle32.v v8, (a0)
@@ -546,62 +534,62 @@ define <16 x iXLen> @lrint_v16f32(<16 x float> %x) {
 ; RV64-i64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV64-i64-NEXT:    vse32.v v8, (a0)
 ; RV64-i64-NEXT:    flw fa5, 124(sp)
+; RV64-i64-NEXT:    vfmv.f.s fa4, v8
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 3
+; RV64-i64-NEXT:    vslidedown.vi v11, v8, 2
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
 ; RV64-i64-NEXT:    sd a0, 248(sp)
 ; RV64-i64-NEXT:    flw fa5, 120(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 240(sp)
+; RV64-i64-NEXT:    vslidedown.vi v12, v8, 1
+; RV64-i64-NEXT:    fcvt.l.s a0, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
+; RV64-i64-NEXT:    fcvt.l.s a1, fa5
+; RV64-i64-NEXT:    sd a1, 240(sp)
 ; RV64-i64-NEXT:    flw fa5, 116(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 232(sp)
+; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; RV64-i64-NEXT:    vslidedown.vi v14, v8, 7
+; RV64-i64-NEXT:    fcvt.l.s a1, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v11
+; RV64-i64-NEXT:    fcvt.l.s a2, fa5
+; RV64-i64-NEXT:    sd a2, 232(sp)
 ; RV64-i64-NEXT:    flw fa5, 112(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 224(sp)
+; RV64-i64-NEXT:    fcvt.l.s a2, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v12
+; RV64-i64-NEXT:    vslidedown.vi v10, v8, 6
+; RV64-i64-NEXT:    fcvt.l.s a3, fa5
+; RV64-i64-NEXT:    sd a3, 224(sp)
 ; RV64-i64-NEXT:    flw fa5, 108(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 216(sp)
+; RV64-i64-NEXT:    fcvt.l.s a3, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v14
+; RV64-i64-NEXT:    vslidedown.vi v12, v8, 5
+; RV64-i64-NEXT:    fcvt.l.s a4, fa5
+; RV64-i64-NEXT:    sd a4, 216(sp)
 ; RV64-i64-NEXT:    flw fa5, 104(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 208(sp)
+; RV64-i64-NEXT:    fcvt.l.s a4, fa4
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
+; RV64-i64-NEXT:    fcvt.l.s a5, fa4
+; RV64-i64-NEXT:    fcvt.l.s a6, fa5
+; RV64-i64-NEXT:    sd a6, 208(sp)
 ; RV64-i64-NEXT:    flw fa5, 100(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 200(sp)
-; RV64-i64-NEXT:    flw fa5, 96(sp)
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    sd a0, 192(sp)
+; RV64-i64-NEXT:    vfmv.f.s fa4, v12
+; RV64-i64-NEXT:    fcvt.l.s a6, fa4
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 4
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a1, fa5
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a2, fa5
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a3, fa5
+; RV64-i64-NEXT:    sd a7, 200(sp)
+; RV64-i64-NEXT:    fcvt.l.s a7, fa5
+; RV64-i64-NEXT:    flw fa5, 96(sp)
 ; RV64-i64-NEXT:    sd a0, 128(sp)
 ; RV64-i64-NEXT:    sd a3, 136(sp)
 ; RV64-i64-NEXT:    sd a2, 144(sp)
 ; RV64-i64-NEXT:    sd a1, 152(sp)
-; RV64-i64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 7
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    sd a7, 160(sp)
+; RV64-i64-NEXT:    sd a6, 168(sp)
+; RV64-i64-NEXT:    sd a5, 176(sp)
+; RV64-i64-NEXT:    sd a4, 184(sp)
 ; RV64-i64-NEXT:    fcvt.l.s a0, fa5
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 6
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a1, fa5
-; RV64-i64-NEXT:    vslidedown.vi v10, v8, 5
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.s a2, fa5
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.s a3, fa5
-; RV64-i64-NEXT:    sd a3, 160(sp)
-; RV64-i64-NEXT:    sd a2, 168(sp)
-; RV64-i64-NEXT:    sd a1, 176(sp)
-; RV64-i64-NEXT:    sd a0, 184(sp)
+; RV64-i64-NEXT:    sd a0, 192(sp)
 ; RV64-i64-NEXT:    addi a0, sp, 128
 ; RV64-i64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-i64-NEXT:    vle64.v v8, (a0)
@@ -653,38 +641,38 @@ define <2 x iXLen> @lrint_v2f64(<2 x double> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v9
-; RV32-NEXT:    fcvt.w.d a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
+; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v9
 ; RV32-NEXT:    fcvt.w.d a1, fa5
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v2f64:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v9
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
+; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i32-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; RV64-i32-NEXT:    vmv.v.x v8, a1
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    vmv.v.x v8, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v2f64:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v9, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v9
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v9
 ; RV64-i64-NEXT:    fcvt.l.d a1, fa5
-; RV64-i64-NEXT:    vmv.v.x v8, a1
-; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i64-NEXT:    vmv.v.x v8, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i64-NEXT:    ret
   %a = call <2 x iXLen> @llvm.lrint.v2iXLen.v2f64(<2 x double> %x)
   ret <2 x iXLen> %a
@@ -696,71 +684,70 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.d a0, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.d a1, fa5
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
+; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v10
+; RV32-NEXT:    fcvt.w.d a1, fa5
 ; RV32-NEXT:    vfmv.f.s fa5, v12
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    fcvt.w.d a0, fa5
-; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.d a0, fa5
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-NEXT:    vslide1down.vx v8, v10, a0
+; RV32-NEXT:    vslide1down.vx v8, v9, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-i32-LABEL: lrint_v4f64:
 ; RV64-i32:       # %bb.0:
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.d a1, fa5
-; RV64-i32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-i32-NEXT:    vmv.v.x v10, a1
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v10
+; RV64-i32-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v12
+; RV64-i32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-i32-NEXT:    vmv.v.x v9, a0
 ; RV64-i32-NEXT:    fcvt.l.d a0, fa5
-; RV64-i32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-i32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64-i32-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v9, a1
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i32-NEXT:    ret
 ;
 ; RV64-i64-LABEL: lrint_v4f64:
 ; RV64-i64:       # %bb.0:
 ; RV64-i64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.d a1, fa5
-; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64-i64-NEXT:    vmv.v.x v10, a1
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
+; RV64-i64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v12, v8, 2
+; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v12
+; RV64-i64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-i64-NEXT:    vmv.v.x v10, a0
 ; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-i64-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    vslide1down.vx v8, v10, a1
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    vslide1down.vx v8, v10, a0
+; RV64-i64-NEXT:    vslide1down.vx v8, v8, a0
 ; RV64-i64-NEXT:    ret
   %a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4f64(<4 x double> %x)
   ret <4 x iXLen> %a
@@ -780,21 +767,22 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    .cfi_def_cfa s0, 0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    mv a0, sp
-; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV32-NEXT:    vse64.v v8, (a0)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vslidedown.vi v12, v8, 1
 ; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.d a1, fa5
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
-; RV32-NEXT:    vfmv.f.s fa5, v10
-; RV32-NEXT:    fcvt.w.d a2, fa5
+; RV32-NEXT:    vslidedown.vi v14, v8, 2
+; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT:    vse64.v v8, (a0)
+; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
-; RV32-NEXT:    vfmv.f.s fa5, v8
-; RV32-NEXT:    fcvt.w.d a3, fa5
+; RV32-NEXT:    vfmv.f.s fa4, v12
+; RV32-NEXT:    fcvt.w.d a0, fa5
+; RV32-NEXT:    vfmv.f.s fa5, v14
+; RV32-NEXT:    vfmv.f.s fa3, v8
+; RV32-NEXT:    fcvt.w.d a1, fa4
+; RV32-NEXT:    fcvt.w.d a2, fa5
+; RV32-NEXT:    fcvt.w.d a3, fa3
 ; RV32-NEXT:    fld fa5, 32(sp)
 ; RV32-NEXT:    fld fa4, 40(sp)
 ; RV32-NEXT:    fld fa3, 48(sp)
@@ -803,8 +791,8 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV32-NEXT:    fcvt.w.d a5, fa4
 ; RV32-NEXT:    fcvt.w.d a6, fa3
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vmv.v.x v8, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32-NEXT:    vslide1down.vx v8, v8, a4
@@ -834,21 +822,22 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i32-NEXT:    .cfi_def_cfa s0, 0
 ; RV64-i32-NEXT:    andi sp, sp, -64
 ; RV64-i32-NEXT:    mv a0, sp
-; RV64-i32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-i32-NEXT:    vse64.v v8, (a0)
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vslidedown.vi v12, v8, 1
 ; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.d a1, fa5
 ; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
-; RV64-i32-NEXT:    vslidedown.vi v10, v8, 2
-; RV64-i32-NEXT:    vfmv.f.s fa5, v10
-; RV64-i32-NEXT:    fcvt.l.d a2, fa5
+; RV64-i32-NEXT:    vslidedown.vi v14, v8, 2
+; RV64-i32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; RV64-i32-NEXT:    vse64.v v8, (a0)
+; RV64-i32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i32-NEXT:    vslidedown.vi v8, v8, 3
-; RV64-i32-NEXT:    vfmv.f.s fa5, v8
-; RV64-i32-NEXT:    fcvt.l.d a3, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa4, v12
+; RV64-i32-NEXT:    fcvt.l.d a0, fa5
+; RV64-i32-NEXT:    vfmv.f.s fa5, v14
+; RV64-i32-NEXT:    vfmv.f.s fa3, v8
+; RV64-i32-NEXT:    fcvt.l.d a1, fa4
+; RV64-i32-NEXT:    fcvt.l.d a2, fa5
+; RV64-i32-NEXT:    fcvt.l.d a3, fa3
 ; RV64-i32-NEXT:    fld fa5, 32(sp)
 ; RV64-i32-NEXT:    fld fa4, 40(sp)
 ; RV64-i32-NEXT:    fld fa3, 48(sp)
@@ -857,8 +846,8 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i32-NEXT:    fcvt.l.d a5, fa4
 ; RV64-i32-NEXT:    fcvt.l.d a6, fa3
 ; RV64-i32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; RV64-i32-NEXT:    vmv.v.x v8, a1
-; RV64-i32-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-i32-NEXT:    vmv.v.x v8, a0
+; RV64-i32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a3
 ; RV64-i32-NEXT:    vslide1down.vx v8, v8, a4
@@ -891,34 +880,34 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
 ; RV64-i64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-i64-NEXT:    vse64.v v8, (a0)
 ; RV64-i64-NEXT:    fld fa5, 56(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    sd a0, 120(sp)
-; RV64-i64-NEXT:    fld fa5, 48(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    sd a0, 112(sp)
-; RV64-i64-NEXT:    fld fa5, 40(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    sd a0, 104(sp)
-; RV64-i64-NEXT:    fld fa5, 32(sp)
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
-; RV64-i64-NEXT:    sd a0, 96(sp)
-; RV64-i64-NEXT:    vfmv.f.s fa5, v8
-; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    vfmv.f.s fa4, v8
 ; RV64-i64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    fcvt.l.d a0, fa4
 ; RV64-i64-NEXT:    fcvt.l.d a1, fa5
+; RV64-i64-NEXT:    sd a1, 120(sp)
+; RV64-i64-NEXT:    fld fa5, 48(sp)
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
 ; RV64-i64-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vi v10, v8, 3
-; RV64-i64-NEXT:    vfmv.f.s fa5, v10
+; RV64-i64-NEXT:    fcvt.l.d a1, fa4
 ; RV64-i64-NEXT:    fcvt.l.d a2, fa5
+; RV64-i64-NEXT:    sd a2, 112(sp)
+; RV64-i64-NEXT:    fld fa5, 40(sp)
+; RV64-i64-NEXT:    vfmv.f.s fa4, v10
+; RV64-i64-NEXT:    fcvt.l.d a2, fa4
 ; RV64-i64-NEXT:    vslidedown.vi v8, v8, 2
+; RV64-i64-NEXT:    fcvt.l.d a3, fa5
 ; RV64-i64-NEXT:    vfmv.f.s fa5, v8
+; RV64-i64-NEXT:    sd a3, 104(sp)
 ; RV64-i64-NEXT:    fcvt.l.d a3, fa5
+; RV64-i64-NEXT:    fld fa5, 32(sp)
 ; RV64-i64-NEXT:    sd a0, 64(sp)
 ; RV64-i64-NEXT:    sd a1, 72(sp)
 ; RV64-i64-NEXT:    sd a3, 80(sp)
 ; RV64-i64-NEXT:    sd a2, 88(sp)
+; RV64-i64-NEXT:    fcvt.l.d a0, fa5
+; RV64-i64-NEXT:    sd a0, 96(sp)
 ; RV64-i64-NEXT:    addi a0, sp, 64
 ; RV64-i64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-i64-NEXT:    vle64.v v8, (a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
index 5b9af1a3cfe233..c29ccd45528b81 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll
@@ -282,11 +282,11 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) {
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vslide1down.vx v9, v8, a0
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    vslide1down.vx v9, v9, a0
-; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
 ; CHECK-NEXT:    vslide1down.vx v8, v8, zero
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
@@ -299,11 +299,11 @@ define <8 x i1> @buildvec_mask_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %w) {
 ; ZVE32F-NEXT:    vmv.v.x v8, a0
 ; ZVE32F-NEXT:    vslide1down.vx v9, v8, a0
 ; ZVE32F-NEXT:    li a0, 1
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, zero
-; ZVE32F-NEXT:    vmv.v.i v0, 15
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
@@ -327,11 +327,11 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %
 ; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vslide1down.vx v9, v8, a0
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    vslide1down.vx v9, v9, a0
-; CHECK-NEXT:    vslide1down.vx v9, v9, a1
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a3
+; CHECK-NEXT:    vslide1down.vx v9, v9, a0
 ; CHECK-NEXT:    vslide1down.vx v8, v8, zero
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a2
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
@@ -344,11 +344,11 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1_2(i1 %x, i1 %y, i1 %z, i1 %
 ; ZVE32F-NEXT:    vmv.v.x v8, a0
 ; ZVE32F-NEXT:    vslide1down.vx v9, v8, a0
 ; ZVE32F-NEXT:    li a0, 1
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, zero
-; ZVE32F-NEXT:    vmv.v.i v0, 15
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1
@@ -370,12 +370,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vslide1down.vx v9, v8, a0
-; CHECK-NEXT:    vslide1down.vx v9, v9, a1
-; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vslide1down.vx v9, v9, a1
 ; CHECK-NEXT:    vslide1down.vx v8, v8, a1
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 1
@@ -386,12 +386,12 @@ define <8 x i1> @buildvec_mask_optsize_nonconst_v8i1(i1 %x, i1 %y) optsize {
 ; ZVE32F:       # %bb.0:
 ; ZVE32F-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; ZVE32F-NEXT:    vmv.v.x v8, a0
+; ZVE32F-NEXT:    vmv.v.i v0, 15
 ; ZVE32F-NEXT:    vslide1down.vx v9, v8, a0
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
-; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; ZVE32F-NEXT:    vmv.v.i v0, 15
+; ZVE32F-NEXT:    vslide1down.vx v9, v9, a1
 ; ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
 ; ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; ZVE32F-NEXT:    vand.vi v8, v8, 1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index bb4589a46bf41e..6cc3f7e76797bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -345,14 +345,14 @@ define <2 x i64> @mgather_v2i8_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    srai a2, a1, 31
-; RV32ZVE32F-NEXT:    vmv.x.s a3, v9
-; RV32ZVE32F-NEXT:    srai a4, a3, 31
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
-; RV32ZVE32F-NEXT:    sw a2, 12(a0)
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
+; RV32ZVE32F-NEXT:    srai a3, a1, 31
+; RV32ZVE32F-NEXT:    srai a4, a2, 31
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 8(a0)
+; RV32ZVE32F-NEXT:    sw a4, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i8_sextload_v2i64:
@@ -406,15 +406,15 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
+; RV32ZVE32F-NEXT:    sw zero, 12(a0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
 ; RV32ZVE32F-NEXT:    andi a1, a1, 255
-; RV32ZVE32F-NEXT:    vmv.x.s a2, v9
 ; RV32ZVE32F-NEXT:    andi a2, a2, 255
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
 ; RV32ZVE32F-NEXT:    sw zero, 4(a0)
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
-; RV32ZVE32F-NEXT:    sw zero, 12(a0)
+; RV32ZVE32F-NEXT:    sw a2, 8(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i8_zextload_v2i64:
@@ -439,8 +439,8 @@ define <2 x i64> @mgather_v2i8_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2 x
 ; RV64ZVE32F-NEXT:  .LBB7_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV64ZVE32F-NEXT:    andi a0, a0, 255
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
+; RV64ZVE32F-NEXT:    andi a0, a0, 255
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    andi a1, a1, 255
 ; RV64ZVE32F-NEXT:    ret
@@ -1038,14 +1038,14 @@ define <2 x i64> @mgather_v2i16_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    srai a2, a1, 31
-; RV32ZVE32F-NEXT:    vmv.x.s a3, v9
-; RV32ZVE32F-NEXT:    srai a4, a3, 31
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
-; RV32ZVE32F-NEXT:    sw a2, 12(a0)
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v8
+; RV32ZVE32F-NEXT:    srai a3, a1, 31
+; RV32ZVE32F-NEXT:    srai a4, a2, 31
+; RV32ZVE32F-NEXT:    sw a1, 0(a0)
+; RV32ZVE32F-NEXT:    sw a3, 4(a0)
+; RV32ZVE32F-NEXT:    sw a2, 8(a0)
+; RV32ZVE32F-NEXT:    sw a4, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i16_sextload_v2i64:
@@ -1097,15 +1097,15 @@ define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ;
 ; RV32ZVE32F-LABEL: mgather_v2i16_zextload_v2i64:
 ; RV32ZVE32F:       # %bb.0:
+; RV32ZVE32F-NEXT:    lui a1, 16
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e16, mf2, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
+; RV32ZVE32F-NEXT:    addi a1, a1, -1
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lui a2, 16
-; RV32ZVE32F-NEXT:    addi a2, a2, -1
-; RV32ZVE32F-NEXT:    and a1, a1, a2
-; RV32ZVE32F-NEXT:    vmv.x.s a3, v9
-; RV32ZVE32F-NEXT:    and a2, a3, a2
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
+; RV32ZVE32F-NEXT:    and a2, a2, a1
+; RV32ZVE32F-NEXT:    and a1, a3, a1
 ; RV32ZVE32F-NEXT:    sw a2, 0(a0)
 ; RV32ZVE32F-NEXT:    sw zero, 4(a0)
 ; RV32ZVE32F-NEXT:    sw a1, 8(a0)
@@ -1135,10 +1135,10 @@ define <2 x i64> @mgather_v2i16_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v8
 ; RV64ZVE32F-NEXT:    lui a1, 16
-; RV64ZVE32F-NEXT:    addiw a1, a1, -1
-; RV64ZVE32F-NEXT:    and a0, a0, a1
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
+; RV64ZVE32F-NEXT:    addiw a1, a1, -1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v8
+; RV64ZVE32F-NEXT:    and a0, a0, a1
 ; RV64ZVE32F-NEXT:    and a1, a2, a1
 ; RV64ZVE32F-NEXT:    ret
   %v = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %ptrs, i32 2, <2 x i1> %m, <2 x i16> %passthru)
@@ -2096,19 +2096,19 @@ define <2 x i64> @mgather_v2i32_sextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ;
 ; RV32ZVE32F-LABEL: mgather_v2i32_sextload_v2i64:
 ; RV32ZVE32F:       # %bb.0:
+; RV32ZVE32F-NEXT:    addi a1, a0, 8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    srai a1, a1, 31
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV32ZVE32F-NEXT:    srai a2, a2, 31
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vse32.v v9, (a0)
-; RV32ZVE32F-NEXT:    addi a3, a0, 8
-; RV32ZVE32F-NEXT:    vse32.v v8, (a3)
+; RV32ZVE32F-NEXT:    vmv.x.s a3, v8
+; RV32ZVE32F-NEXT:    srai a2, a2, 31
+; RV32ZVE32F-NEXT:    vse32.v v8, (a1)
+; RV32ZVE32F-NEXT:    srai a3, a3, 31
 ; RV32ZVE32F-NEXT:    sw a2, 4(a0)
-; RV32ZVE32F-NEXT:    sw a1, 12(a0)
+; RV32ZVE32F-NEXT:    sw a3, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i32_sextload_v2i64:
@@ -2160,15 +2160,15 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ;
 ; RV32ZVE32F-LABEL: mgather_v2i32_zextload_v2i64:
 ; RV32ZVE32F:       # %bb.0:
+; RV32ZVE32F-NEXT:    addi a1, a0, 8
 ; RV32ZVE32F-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
 ; RV32ZVE32F-NEXT:    vluxei32.v v9, (zero), v8, v0.t
-; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
 ; RV32ZVE32F-NEXT:    sw zero, 4(a0)
 ; RV32ZVE32F-NEXT:    sw zero, 12(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vse32.v v9, (a0)
-; RV32ZVE32F-NEXT:    addi a0, a0, 8
-; RV32ZVE32F-NEXT:    vse32.v v8, (a0)
+; RV32ZVE32F-NEXT:    vslidedown.vi v8, v9, 1
+; RV32ZVE32F-NEXT:    vse32.v v8, (a1)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_v2i32_zextload_v2i64:
@@ -2193,10 +2193,10 @@ define <2 x i64> @mgather_v2i32_zextload_v2i64(<2 x ptr> %ptrs, <2 x i1> %m, <2
 ; RV64ZVE32F-NEXT:  .LBB30_4: # %else2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV64ZVE32F-NEXT:    slli a0, a0, 32
-; RV64ZVE32F-NEXT:    srli a0, a0, 32
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 1
+; RV64ZVE32F-NEXT:    slli a0, a0, 32
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
+; RV64ZVE32F-NEXT:    srli a0, a0, 32
 ; RV64ZVE32F-NEXT:    slli a1, a1, 32
 ; RV64ZVE32F-NEXT:    srli a1, a1, 32
 ; RV64ZVE32F-NEXT:    ret
@@ -3776,28 +3776,28 @@ define <4 x i64> @mgather_truemask_v4i64(<4 x ptr> %ptrs, <4 x i64> %passthru) {
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    lw a2, 0(a1)
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a3, v9
-; RV32ZVE32F-NEXT:    lw a4, 0(a3)
-; RV32ZVE32F-NEXT:    lw a3, 4(a3)
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v9
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
-; RV32ZVE32F-NEXT:    lw a6, 0(a5)
-; RV32ZVE32F-NEXT:    lw a5, 4(a5)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vmv.x.s a7, v8
-; RV32ZVE32F-NEXT:    lw t0, 0(a7)
-; RV32ZVE32F-NEXT:    lw a7, 4(a7)
-; RV32ZVE32F-NEXT:    sw a6, 16(a0)
-; RV32ZVE32F-NEXT:    sw a5, 20(a0)
+; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
+; RV32ZVE32F-NEXT:    vmv.x.s a4, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a5, v8
+; RV32ZVE32F-NEXT:    lw a6, 0(a2)
+; RV32ZVE32F-NEXT:    lw a2, 4(a2)
+; RV32ZVE32F-NEXT:    lw a7, 0(a4)
+; RV32ZVE32F-NEXT:    lw a4, 4(a4)
+; RV32ZVE32F-NEXT:    lw t0, 0(a5)
+; RV32ZVE32F-NEXT:    lw a5, 4(a5)
+; RV32ZVE32F-NEXT:    sw a7, 16(a0)
+; RV32ZVE32F-NEXT:    sw a4, 20(a0)
 ; RV32ZVE32F-NEXT:    sw t0, 24(a0)
-; RV32ZVE32F-NEXT:    sw a7, 28(a0)
-; RV32ZVE32F-NEXT:    sw a2, 0(a0)
+; RV32ZVE32F-NEXT:    sw a5, 28(a0)
+; RV32ZVE32F-NEXT:    sw a3, 0(a0)
 ; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    sw a4, 8(a0)
-; RV32ZVE32F-NEXT:    sw a3, 12(a0)
+; RV32ZVE32F-NEXT:    sw a6, 8(a0)
+; RV32ZVE32F-NEXT:    sw a2, 12(a0)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mgather_truemask_v4i64:
@@ -4132,11 +4132,11 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB48_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -4409,11 +4409,11 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB49_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -4688,11 +4688,11 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB50_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -4974,11 +4974,11 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB51_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -5252,11 +5252,11 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB52_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -5532,11 +5532,11 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
-; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, t0, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a3, .LBB53_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -6666,6 +6666,9 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    lw a2, 24(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v8, t0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
@@ -6674,10 +6677,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s t0, v0
 ; RV32ZVE32F-NEXT:    andi a2, t0, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    beqz a2, .LBB57_7
 ; RV32ZVE32F-NEXT:  # %bb.1: # %cond.load
@@ -6813,89 +6813,89 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV64ZVE32F-LABEL: mgather_baseidx_v8i64:
 ; RV64ZVE32F:       # %bb.0:
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a6, v0
-; RV64ZVE32F-NEXT:    andi a4, a6, 1
+; RV64ZVE32F-NEXT:    vmv.x.s a7, v0
+; RV64ZVE32F-NEXT:    andi a4, a7, 1
 ; RV64ZVE32F-NEXT:    beqz a4, .LBB57_9
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.load
 ; RV64ZVE32F-NEXT:    ld a4, 0(a2)
 ; RV64ZVE32F-NEXT:    slli a4, a4, 3
 ; RV64ZVE32F-NEXT:    add a4, a1, a4
 ; RV64ZVE32F-NEXT:    ld a4, 0(a4)
-; RV64ZVE32F-NEXT:    andi a5, a6, 2
+; RV64ZVE32F-NEXT:    andi a5, a7, 2
 ; RV64ZVE32F-NEXT:    bnez a5, .LBB57_10
 ; RV64ZVE32F-NEXT:  .LBB57_2:
 ; RV64ZVE32F-NEXT:    ld a5, 8(a3)
-; RV64ZVE32F-NEXT:    andi a7, a6, 4
-; RV64ZVE32F-NEXT:    bnez a7, .LBB57_11
+; RV64ZVE32F-NEXT:    andi a6, a7, 4
+; RV64ZVE32F-NEXT:    bnez a6, .LBB57_11
 ; RV64ZVE32F-NEXT:  .LBB57_3:
-; RV64ZVE32F-NEXT:    ld a7, 16(a3)
-; RV64ZVE32F-NEXT:    andi t0, a6, 8
+; RV64ZVE32F-NEXT:    ld a6, 16(a3)
+; RV64ZVE32F-NEXT:    andi t0, a7, 8
 ; RV64ZVE32F-NEXT:    bnez t0, .LBB57_12
 ; RV64ZVE32F-NEXT:  .LBB57_4:
 ; RV64ZVE32F-NEXT:    ld t0, 24(a3)
-; RV64ZVE32F-NEXT:    andi t1, a6, 16
+; RV64ZVE32F-NEXT:    andi t1, a7, 16
 ; RV64ZVE32F-NEXT:    bnez t1, .LBB57_13
 ; RV64ZVE32F-NEXT:  .LBB57_5:
 ; RV64ZVE32F-NEXT:    ld t1, 32(a3)
-; RV64ZVE32F-NEXT:    andi t2, a6, 32
+; RV64ZVE32F-NEXT:    andi t2, a7, 32
 ; RV64ZVE32F-NEXT:    bnez t2, .LBB57_14
 ; RV64ZVE32F-NEXT:  .LBB57_6:
 ; RV64ZVE32F-NEXT:    ld t2, 40(a3)
-; RV64ZVE32F-NEXT:    andi t3, a6, 64
+; RV64ZVE32F-NEXT:    andi t3, a7, 64
 ; RV64ZVE32F-NEXT:    bnez t3, .LBB57_15
 ; RV64ZVE32F-NEXT:  .LBB57_7:
 ; RV64ZVE32F-NEXT:    ld t3, 48(a3)
-; RV64ZVE32F-NEXT:    andi a6, a6, -128
-; RV64ZVE32F-NEXT:    bnez a6, .LBB57_16
+; RV64ZVE32F-NEXT:    andi a7, a7, -128
+; RV64ZVE32F-NEXT:    bnez a7, .LBB57_16
 ; RV64ZVE32F-NEXT:  .LBB57_8:
 ; RV64ZVE32F-NEXT:    ld a1, 56(a3)
 ; RV64ZVE32F-NEXT:    j .LBB57_17
 ; RV64ZVE32F-NEXT:  .LBB57_9:
 ; RV64ZVE32F-NEXT:    ld a4, 0(a3)
-; RV64ZVE32F-NEXT:    andi a5, a6, 2
+; RV64ZVE32F-NEXT:    andi a5, a7, 2
 ; RV64ZVE32F-NEXT:    beqz a5, .LBB57_2
 ; RV64ZVE32F-NEXT:  .LBB57_10: # %cond.load1
 ; RV64ZVE32F-NEXT:    ld a5, 8(a2)
 ; RV64ZVE32F-NEXT:    slli a5, a5, 3
 ; RV64ZVE32F-NEXT:    add a5, a1, a5
 ; RV64ZVE32F-NEXT:    ld a5, 0(a5)
-; RV64ZVE32F-NEXT:    andi a7, a6, 4
-; RV64ZVE32F-NEXT:    beqz a7, .LBB57_3
+; RV64ZVE32F-NEXT:    andi a6, a7, 4
+; RV64ZVE32F-NEXT:    beqz a6, .LBB57_3
 ; RV64ZVE32F-NEXT:  .LBB57_11: # %cond.load4
-; RV64ZVE32F-NEXT:    ld a7, 16(a2)
-; RV64ZVE32F-NEXT:    slli a7, a7, 3
-; RV64ZVE32F-NEXT:    add a7, a1, a7
-; RV64ZVE32F-NEXT:    ld a7, 0(a7)
-; RV64ZVE32F-NEXT:    andi t0, a6, 8
+; RV64ZVE32F-NEXT:    ld a6, 16(a2)
+; RV64ZVE32F-NEXT:    slli a6, a6, 3
+; RV64ZVE32F-NEXT:    add a6, a1, a6
+; RV64ZVE32F-NEXT:    ld a6, 0(a6)
+; RV64ZVE32F-NEXT:    andi t0, a7, 8
 ; RV64ZVE32F-NEXT:    beqz t0, .LBB57_4
 ; RV64ZVE32F-NEXT:  .LBB57_12: # %cond.load7
 ; RV64ZVE32F-NEXT:    ld t0, 24(a2)
 ; RV64ZVE32F-NEXT:    slli t0, t0, 3
 ; RV64ZVE32F-NEXT:    add t0, a1, t0
 ; RV64ZVE32F-NEXT:    ld t0, 0(t0)
-; RV64ZVE32F-NEXT:    andi t1, a6, 16
+; RV64ZVE32F-NEXT:    andi t1, a7, 16
 ; RV64ZVE32F-NEXT:    beqz t1, .LBB57_5
 ; RV64ZVE32F-NEXT:  .LBB57_13: # %cond.load10
 ; RV64ZVE32F-NEXT:    ld t1, 32(a2)
 ; RV64ZVE32F-NEXT:    slli t1, t1, 3
 ; RV64ZVE32F-NEXT:    add t1, a1, t1
 ; RV64ZVE32F-NEXT:    ld t1, 0(t1)
-; RV64ZVE32F-NEXT:    andi t2, a6, 32
+; RV64ZVE32F-NEXT:    andi t2, a7, 32
 ; RV64ZVE32F-NEXT:    beqz t2, .LBB57_6
 ; RV64ZVE32F-NEXT:  .LBB57_14: # %cond.load13
 ; RV64ZVE32F-NEXT:    ld t2, 40(a2)
 ; RV64ZVE32F-NEXT:    slli t2, t2, 3
 ; RV64ZVE32F-NEXT:    add t2, a1, t2
 ; RV64ZVE32F-NEXT:    ld t2, 0(t2)
-; RV64ZVE32F-NEXT:    andi t3, a6, 64
+; RV64ZVE32F-NEXT:    andi t3, a7, 64
 ; RV64ZVE32F-NEXT:    beqz t3, .LBB57_7
 ; RV64ZVE32F-NEXT:  .LBB57_15: # %cond.load16
 ; RV64ZVE32F-NEXT:    ld t3, 48(a2)
 ; RV64ZVE32F-NEXT:    slli t3, t3, 3
 ; RV64ZVE32F-NEXT:    add t3, a1, t3
 ; RV64ZVE32F-NEXT:    ld t3, 0(t3)
-; RV64ZVE32F-NEXT:    andi a6, a6, -128
-; RV64ZVE32F-NEXT:    beqz a6, .LBB57_8
+; RV64ZVE32F-NEXT:    andi a7, a7, -128
+; RV64ZVE32F-NEXT:    beqz a7, .LBB57_8
 ; RV64ZVE32F-NEXT:  .LBB57_16: # %cond.load19
 ; RV64ZVE32F-NEXT:    ld a2, 56(a2)
 ; RV64ZVE32F-NEXT:    slli a2, a2, 3
@@ -6904,7 +6904,7 @@ define <8 x i64> @mgather_baseidx_v8i64(ptr %base, <8 x i64> %idxs, <8 x i1> %m,
 ; RV64ZVE32F-NEXT:  .LBB57_17: # %else20
 ; RV64ZVE32F-NEXT:    sd a4, 0(a0)
 ; RV64ZVE32F-NEXT:    sd a5, 8(a0)
-; RV64ZVE32F-NEXT:    sd a7, 16(a0)
+; RV64ZVE32F-NEXT:    sd a6, 16(a0)
 ; RV64ZVE32F-NEXT:    sd t0, 24(a0)
 ; RV64ZVE32F-NEXT:    sd t1, 32(a0)
 ; RV64ZVE32F-NEXT:    sd t2, 40(a0)
@@ -11097,14 +11097,14 @@ define <4 x double> @mgather_truemask_v4f64(<4 x ptr> %ptrs, <4 x double> %passt
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV32ZVE32F-NEXT:    fld fa5, 0(a1)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV32ZVE32F-NEXT:    fld fa5, 0(a1)
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
-; RV32ZVE32F-NEXT:    fld fa4, 0(a1)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV32ZVE32F-NEXT:    fld fa4, 0(a1)
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV32ZVE32F-NEXT:    fld fa3, 0(a1)
-; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV32ZVE32F-NEXT:    fld fa2, 0(a1)
 ; RV32ZVE32F-NEXT:    fsd fa5, 0(a0)
@@ -11375,11 +11375,11 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB97_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -11590,11 +11590,11 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB98_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -11807,11 +11807,11 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB99_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -12031,11 +12031,11 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB100_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -12247,11 +12247,11 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB101_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -12465,11 +12465,11 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
-; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a3, a2, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB102_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -13348,21 +13348,21 @@ define <8 x double> @mgather_baseidx_v8f64(ptr %base, <8 x i64> %idxs, <8 x i1>
 ; RV32ZVE32F-NEXT:    lw a7, 0(a2)
 ; RV32ZVE32F-NEXT:    lw t0, 8(a2)
 ; RV32ZVE32F-NEXT:    lw t1, 16(a2)
-; RV32ZVE32F-NEXT:    lw a2, 24(a2)
+; RV32ZVE32F-NEXT:    lw t2, 24(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v8, a7
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    andi a3, a2, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez a3, .LBB106_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -13807,14 +13807,14 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
 ; RV64V-NEXT:    vsext.vf8 v16, v8
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64V-NEXT:    vslidedown.vi v12, v10, 16
+; RV64V-NEXT:    vslidedown.vi v14, v8, 16
+; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64V-NEXT:    vslidedown.vi v8, v0, 2
 ; RV64V-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
 ; RV64V-NEXT:    vluxei64.v v10, (a0), v16, v0.t
-; RV64V-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
-; RV64V-NEXT:    vslidedown.vi v8, v8, 16
-; RV64V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64V-NEXT:    vslidedown.vi v0, v0, 2
-; RV64V-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64V-NEXT:    vsext.vf8 v16, v8
+; RV64V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; RV64V-NEXT:    vsext.vf8 v16, v14
+; RV64V-NEXT:    vmv1r.v v0, v8
 ; RV64V-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64V-NEXT:    vluxei64.v v12, (a0), v16, v0.t
 ; RV64V-NEXT:    li a0, 32
@@ -14384,65 +14384,65 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV32-NEXT:    vsll.vi v8, v8, 2
 ; RV32-NEXT:    vadd.vx v8, v8, a0
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    lbu a1, 0(a0)
-; RV32-NEXT:    lbu a0, 1(a0)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a2, v10
-; RV32-NEXT:    lbu a3, 1(a2)
-; RV32-NEXT:    lbu a2, 0(a2)
-; RV32-NEXT:    slli a0, a0, 8
-; RV32-NEXT:    or a0, a0, a1
-; RV32-NEXT:    slli a3, a3, 8
-; RV32-NEXT:    vslidedown.vi v10, v8, 2
+; RV32-NEXT:    vslidedown.vi v11, v8, 2
 ; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    lbu a4, 0(a1)
-; RV32-NEXT:    lbu a1, 1(a1)
 ; RV32-NEXT:    vslidedown.vi v10, v8, 3
-; RV32-NEXT:    vmv.x.s a5, v10
-; RV32-NEXT:    lbu a6, 0(a5)
-; RV32-NEXT:    lbu a5, 1(a5)
-; RV32-NEXT:    or a2, a3, a2
-; RV32-NEXT:    slli a1, a1, 8
-; RV32-NEXT:    or a1, a1, a4
-; RV32-NEXT:    slli a5, a5, 8
+; RV32-NEXT:    vmv.x.s a2, v11
+; RV32-NEXT:    vmv.x.s a3, v10
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v10, v8, 4
-; RV32-NEXT:    vmv.x.s a3, v10
-; RV32-NEXT:    lbu a4, 0(a3)
-; RV32-NEXT:    lbu a3, 1(a3)
+; RV32-NEXT:    vmv.x.s a4, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 5
-; RV32-NEXT:    vmv.x.s a7, v10
-; RV32-NEXT:    lbu t0, 0(a7)
-; RV32-NEXT:    lbu a7, 1(a7)
-; RV32-NEXT:    or a5, a5, a6
-; RV32-NEXT:    slli a3, a3, 8
-; RV32-NEXT:    or a3, a3, a4
-; RV32-NEXT:    slli a7, a7, 8
+; RV32-NEXT:    vmv.x.s a5, v10
 ; RV32-NEXT:    vslidedown.vi v10, v8, 6
-; RV32-NEXT:    vmv.x.s a4, v10
-; RV32-NEXT:    lbu a6, 0(a4)
-; RV32-NEXT:    lbu a4, 1(a4)
 ; RV32-NEXT:    vslidedown.vi v8, v8, 7
-; RV32-NEXT:    vmv.x.s t1, v8
-; RV32-NEXT:    lbu t2, 0(t1)
-; RV32-NEXT:    lbu t1, 1(t1)
-; RV32-NEXT:    or a7, a7, t0
+; RV32-NEXT:    lbu a6, 0(a0)
+; RV32-NEXT:    lbu a0, 1(a0)
+; RV32-NEXT:    vmv.x.s a7, v10
+; RV32-NEXT:    vmv.x.s t0, v8
+; RV32-NEXT:    lbu t1, 0(a1)
+; RV32-NEXT:    lbu a1, 1(a1)
+; RV32-NEXT:    lbu t2, 0(a2)
+; RV32-NEXT:    lbu a2, 1(a2)
+; RV32-NEXT:    slli a0, a0, 8
+; RV32-NEXT:    or a0, a0, a6
+; RV32-NEXT:    lbu a6, 0(a3)
+; RV32-NEXT:    lbu a3, 1(a3)
+; RV32-NEXT:    slli a1, a1, 8
+; RV32-NEXT:    or a1, a1, t1
+; RV32-NEXT:    lbu t1, 0(a4)
+; RV32-NEXT:    lbu a4, 1(a4)
+; RV32-NEXT:    slli a2, a2, 8
+; RV32-NEXT:    or a2, a2, t2
+; RV32-NEXT:    lbu t2, 0(a5)
+; RV32-NEXT:    lbu a5, 1(a5)
+; RV32-NEXT:    slli a3, a3, 8
+; RV32-NEXT:    or a3, a3, a6
+; RV32-NEXT:    lbu a6, 0(a7)
+; RV32-NEXT:    lbu a7, 1(a7)
 ; RV32-NEXT:    slli a4, a4, 8
-; RV32-NEXT:    or a4, a4, a6
-; RV32-NEXT:    slli t1, t1, 8
-; RV32-NEXT:    or a6, t1, t2
+; RV32-NEXT:    or a4, a4, t1
+; RV32-NEXT:    lbu t1, 0(t0)
+; RV32-NEXT:    lbu t0, 1(t0)
+; RV32-NEXT:    slli a5, a5, 8
+; RV32-NEXT:    or a5, a5, t2
+; RV32-NEXT:    slli a7, a7, 8
+; RV32-NEXT:    or a6, a7, a6
+; RV32-NEXT:    slli t0, t0, 8
+; RV32-NEXT:    or a7, t0, t1
 ; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    vslide1down.vx v9, v8, a5
-; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    vslide1down.vx v8, v8, a7
-; RV32-NEXT:    vslide1down.vx v8, v8, a4
+; RV32-NEXT:    vmv.v.x v9, a4
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    vslide1down.vx v9, v9, a5
+; RV32-NEXT:    vslide1down.vx v10, v8, a3
+; RV32-NEXT:    vslide1down.vx v8, v9, a6
 ; RV32-NEXT:    vmv.v.i v0, 15
-; RV32-NEXT:    vslide1down.vx v8, v8, a6
-; RV32-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV32-NEXT:    vslide1down.vx v8, v8, a7
+; RV32-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64V-LABEL: mgather_strided_unaligned:
@@ -14458,65 +14458,65 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ; RV64V-NEXT:    andi sp, sp, -64
 ; RV64V-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64V-NEXT:    vid.v v8
+; RV64V-NEXT:    mv a1, sp
 ; RV64V-NEXT:    vsll.vi v8, v8, 2
 ; RV64V-NEXT:    vadd.vx v8, v8, a0
 ; RV64V-NEXT:    vmv.x.s a0, v8
-; RV64V-NEXT:    lbu a1, 0(a0)
-; RV64V-NEXT:    lbu a0, 1(a0)
 ; RV64V-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64V-NEXT:    vslidedown.vi v12, v8, 1
 ; RV64V-NEXT:    vmv.x.s a2, v12
-; RV64V-NEXT:    lbu a3, 1(a2)
-; RV64V-NEXT:    lbu a2, 0(a2)
-; RV64V-NEXT:    slli a0, a0, 8
-; RV64V-NEXT:    or a0, a0, a1
-; RV64V-NEXT:    slli a3, a3, 8
 ; RV64V-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV64V-NEXT:    vslidedown.vi v12, v8, 2
-; RV64V-NEXT:    vmv.x.s a1, v12
-; RV64V-NEXT:    lbu a4, 0(a1)
-; RV64V-NEXT:    lbu a1, 1(a1)
+; RV64V-NEXT:    vmv.x.s a3, v12
 ; RV64V-NEXT:    vslidedown.vi v12, v8, 3
+; RV64V-NEXT:    lbu a4, 0(a0)
+; RV64V-NEXT:    lbu a0, 1(a0)
 ; RV64V-NEXT:    vmv.x.s a5, v12
-; RV64V-NEXT:    lbu a6, 0(a5)
+; RV64V-NEXT:    lbu a6, 0(a2)
+; RV64V-NEXT:    lbu a2, 1(a2)
+; RV64V-NEXT:    lbu a7, 0(a3)
+; RV64V-NEXT:    lbu a3, 1(a3)
+; RV64V-NEXT:    lbu t0, 0(a5)
 ; RV64V-NEXT:    lbu a5, 1(a5)
-; RV64V-NEXT:    or a2, a3, a2
-; RV64V-NEXT:    slli a1, a1, 8
-; RV64V-NEXT:    or a1, a1, a4
-; RV64V-NEXT:    slli a5, a5, 8
-; RV64V-NEXT:    mv a3, sp
 ; RV64V-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64V-NEXT:    vse64.v v8, (a3)
+; RV64V-NEXT:    vse64.v v8, (a1)
+; RV64V-NEXT:    slli a0, a0, 8
+; RV64V-NEXT:    or a0, a0, a4
+; RV64V-NEXT:    slli a2, a2, 8
+; RV64V-NEXT:    slli a3, a3, 8
+; RV64V-NEXT:    or a1, a2, a6
+; RV64V-NEXT:    or a2, a3, a7
 ; RV64V-NEXT:    ld a3, 32(sp)
 ; RV64V-NEXT:    ld a4, 40(sp)
-; RV64V-NEXT:    ld a7, 48(sp)
-; RV64V-NEXT:    ld t0, 56(sp)
-; RV64V-NEXT:    lbu t1, 0(a3)
+; RV64V-NEXT:    ld a6, 48(sp)
+; RV64V-NEXT:    ld a7, 56(sp)
+; RV64V-NEXT:    slli a5, a5, 8
+; RV64V-NEXT:    or a5, a5, t0
+; RV64V-NEXT:    lbu t0, 0(a3)
 ; RV64V-NEXT:    lbu a3, 1(a3)
-; RV64V-NEXT:    lbu t2, 0(a4)
+; RV64V-NEXT:    vmv.v.x v8, a0
+; RV64V-NEXT:    lbu a0, 0(a4)
 ; RV64V-NEXT:    lbu a4, 1(a4)
-; RV64V-NEXT:    or a5, a5, a6
+; RV64V-NEXT:    vslide1down.vx v8, v8, a1
+; RV64V-NEXT:    lbu a1, 0(a6)
+; RV64V-NEXT:    lbu a6, 1(a6)
+; RV64V-NEXT:    vslide1down.vx v8, v8, a2
+; RV64V-NEXT:    lbu a2, 0(a7)
+; RV64V-NEXT:    lbu a7, 1(a7)
+; RV64V-NEXT:    vslide1down.vx v9, v8, a5
 ; RV64V-NEXT:    slli a3, a3, 8
-; RV64V-NEXT:    or a3, a3, t1
 ; RV64V-NEXT:    slli a4, a4, 8
-; RV64V-NEXT:    lbu a6, 0(a7)
-; RV64V-NEXT:    lbu a7, 1(a7)
-; RV64V-NEXT:    lbu t1, 0(t0)
-; RV64V-NEXT:    lbu t0, 1(t0)
-; RV64V-NEXT:    or a4, a4, t2
+; RV64V-NEXT:    slli a6, a6, 8
 ; RV64V-NEXT:    slli a7, a7, 8
-; RV64V-NEXT:    or a6, a7, a6
-; RV64V-NEXT:    slli t0, t0, 8
-; RV64V-NEXT:    or a7, t0, t1
-; RV64V-NEXT:    vmv.v.x v8, a0
-; RV64V-NEXT:    vslide1down.vx v8, v8, a2
-; RV64V-NEXT:    vslide1down.vx v8, v8, a1
-; RV64V-NEXT:    vslide1down.vx v9, v8, a5
+; RV64V-NEXT:    or a3, a3, t0
+; RV64V-NEXT:    or a0, a4, a0
+; RV64V-NEXT:    or a1, a6, a1
+; RV64V-NEXT:    or a2, a7, a2
 ; RV64V-NEXT:    vmv.v.x v8, a3
-; RV64V-NEXT:    vslide1down.vx v8, v8, a4
-; RV64V-NEXT:    vslide1down.vx v8, v8, a6
+; RV64V-NEXT:    vslide1down.vx v8, v8, a0
+; RV64V-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64V-NEXT:    vmv.v.i v0, 15
-; RV64V-NEXT:    vslide1down.vx v8, v8, a7
+; RV64V-NEXT:    vslide1down.vx v8, v8, a2
 ; RV64V-NEXT:    vslidedown.vi v8, v9, 4, v0.t
 ; RV64V-NEXT:    addi sp, s0, -128
 ; RV64V-NEXT:    .cfi_def_cfa sp, 128
@@ -14530,49 +14530,49 @@ define <8 x i16> @mgather_strided_unaligned(ptr %base) {
 ;
 ; RV64ZVE32F-LABEL: mgather_strided_unaligned:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    lbu a1, 1(a0)
-; RV64ZVE32F-NEXT:    lbu a2, 0(a0)
-; RV64ZVE32F-NEXT:    lbu a3, 5(a0)
-; RV64ZVE32F-NEXT:    lbu a4, 4(a0)
-; RV64ZVE32F-NEXT:    slli a1, a1, 8
-; RV64ZVE32F-NEXT:    or a1, a1, a2
-; RV64ZVE32F-NEXT:    slli a3, a3, 8
-; RV64ZVE32F-NEXT:    lbu a2, 8(a0)
-; RV64ZVE32F-NEXT:    lbu a5, 9(a0)
-; RV64ZVE32F-NEXT:    lbu a6, 12(a0)
-; RV64ZVE32F-NEXT:    lbu a7, 13(a0)
-; RV64ZVE32F-NEXT:    or a3, a3, a4
-; RV64ZVE32F-NEXT:    slli a5, a5, 8
-; RV64ZVE32F-NEXT:    or a2, a5, a2
-; RV64ZVE32F-NEXT:    slli a7, a7, 8
-; RV64ZVE32F-NEXT:    lbu a4, 16(a0)
-; RV64ZVE32F-NEXT:    lbu a5, 17(a0)
-; RV64ZVE32F-NEXT:    lbu t0, 20(a0)
-; RV64ZVE32F-NEXT:    lbu t1, 21(a0)
-; RV64ZVE32F-NEXT:    or a6, a7, a6
-; RV64ZVE32F-NEXT:    slli a5, a5, 8
-; RV64ZVE32F-NEXT:    or a4, a5, a4
-; RV64ZVE32F-NEXT:    slli t1, t1, 8
-; RV64ZVE32F-NEXT:    lbu a5, 24(a0)
+; RV64ZVE32F-NEXT:    lbu a1, 0(a0)
+; RV64ZVE32F-NEXT:    lbu a2, 1(a0)
+; RV64ZVE32F-NEXT:    lbu a3, 4(a0)
+; RV64ZVE32F-NEXT:    lbu a4, 5(a0)
+; RV64ZVE32F-NEXT:    lbu a5, 8(a0)
+; RV64ZVE32F-NEXT:    lbu a6, 9(a0)
+; RV64ZVE32F-NEXT:    lbu a7, 12(a0)
+; RV64ZVE32F-NEXT:    lbu t0, 13(a0)
+; RV64ZVE32F-NEXT:    slli a2, a2, 8
+; RV64ZVE32F-NEXT:    slli a4, a4, 8
+; RV64ZVE32F-NEXT:    or a1, a2, a1
+; RV64ZVE32F-NEXT:    or a3, a4, a3
+; RV64ZVE32F-NEXT:    lbu a2, 16(a0)
+; RV64ZVE32F-NEXT:    lbu a4, 17(a0)
+; RV64ZVE32F-NEXT:    lbu t1, 20(a0)
+; RV64ZVE32F-NEXT:    lbu t2, 21(a0)
+; RV64ZVE32F-NEXT:    slli a6, a6, 8
+; RV64ZVE32F-NEXT:    or a5, a6, a5
+; RV64ZVE32F-NEXT:    slli t0, t0, 8
+; RV64ZVE32F-NEXT:    slli a4, a4, 8
+; RV64ZVE32F-NEXT:    slli t2, t2, 8
+; RV64ZVE32F-NEXT:    or a6, t0, a7
+; RV64ZVE32F-NEXT:    or a2, a4, a2
+; RV64ZVE32F-NEXT:    lbu a4, 24(a0)
 ; RV64ZVE32F-NEXT:    lbu a7, 25(a0)
-; RV64ZVE32F-NEXT:    lbu t2, 28(a0)
+; RV64ZVE32F-NEXT:    or t0, t2, t1
+; RV64ZVE32F-NEXT:    lbu t1, 28(a0)
 ; RV64ZVE32F-NEXT:    lbu a0, 29(a0)
-; RV64ZVE32F-NEXT:    or t0, t1, t0
 ; RV64ZVE32F-NEXT:    slli a7, a7, 8
-; RV64ZVE32F-NEXT:    or a5, a7, a5
-; RV64ZVE32F-NEXT:    slli a0, a0, 8
-; RV64ZVE32F-NEXT:    or a0, a0, t2
+; RV64ZVE32F-NEXT:    or a4, a7, a4
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
+; RV64ZVE32F-NEXT:    slli a0, a0, 8
+; RV64ZVE32F-NEXT:    or a0, a0, t1
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a6
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a4
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, t0
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a6
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14606,16 +14606,16 @@ define <8 x i16> @mgather_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 24(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 26(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14651,16 +14651,16 @@ define <8 x i16> @mgather_strided_2xSEW_with_offset(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 28(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 30(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 2, i64 3, i64 6, i64 7, i64 10, i64 11, i64 14, i64 15>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14696,16 +14696,16 @@ define <8 x i16> @mgather_reverse_unit_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 20(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 22(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a3
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a2
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a7
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a6
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 14, i64 15, i64 12, i64 13, i64 10, i64 11, i64 8, i64 9>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14741,16 +14741,16 @@ define <8 x i16> @mgather_reverse_strided_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 12(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 14(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a3
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a7
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a2
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a7
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a6
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 14, i64 15, i64 10, i64 11, i64 6, i64 7, i64 2, i64 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14785,16 +14785,16 @@ define <8 x i16> @mgather_gather_2xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 4(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 6(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a1
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 2, i32 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14832,16 +14832,16 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 4(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 6(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a1
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 2, i32 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 2, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14871,24 +14871,24 @@ define <8 x i16> @mgather_gather_2xSEW_unaligned2(ptr %base) {
 ;
 ; RV64ZVE32F-LABEL: mgather_gather_2xSEW_unaligned2:
 ; RV64ZVE32F:       # %bb.0:
-; RV64ZVE32F-NEXT:    lh a1, 10(a0)
-; RV64ZVE32F-NEXT:    lh a2, 18(a0)
-; RV64ZVE32F-NEXT:    lh a3, 20(a0)
-; RV64ZVE32F-NEXT:    lh a4, 2(a0)
-; RV64ZVE32F-NEXT:    lh a5, 4(a0)
-; RV64ZVE32F-NEXT:    lh a6, 6(a0)
-; RV64ZVE32F-NEXT:    lh a0, 8(a0)
+; RV64ZVE32F-NEXT:    lh a1, 2(a0)
+; RV64ZVE32F-NEXT:    lh a2, 4(a0)
+; RV64ZVE32F-NEXT:    lh a3, 6(a0)
+; RV64ZVE32F-NEXT:    lh a4, 8(a0)
+; RV64ZVE32F-NEXT:    lh a5, 10(a0)
+; RV64ZVE32F-NEXT:    lh a6, 18(a0)
+; RV64ZVE32F-NEXT:    lh a0, 20(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a4
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a3
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a0
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
+; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a0
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a3
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 1, i32 2, i32 9, i32 10, i32 4, i32 5, i32 2, i32 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14930,16 +14930,16 @@ define <8 x i16> @mgather_gather_4xSEW(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 20(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 22(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -14978,16 +14978,16 @@ define <8 x i16> @mgather_gather_4xSEW_partial_align(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 20(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 22(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i32>  <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -15035,16 +15035,16 @@ define <8 x i16> @mgather_shuffle_rotate(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 4(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 6(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a4
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a4
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 4, i64 5, i64 6, i64 7, i64 0, i64 1, i64 2, i64 3>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -15083,16 +15083,16 @@ define <8 x i16> @mgather_shuffle_vrgather(ptr %base) {
 ; RV64ZVE32F-NEXT:    lh a7, 12(a0)
 ; RV64ZVE32F-NEXT:    lh a0, 14(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
+; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
 ; RV64ZVE32F-NEXT:    vmv.v.x v8, a1
+; RV64ZVE32F-NEXT:    vmv.v.x v9, a5
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a6
 ; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
-; RV64ZVE32F-NEXT:    vslide1down.vx v9, v8, a2
-; RV64ZVE32F-NEXT:    vmv.v.x v8, a5
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a6
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
-; RV64ZVE32F-NEXT:    vmv.v.i v0, 15
-; RV64ZVE32F-NEXT:    vslide1down.vx v8, v8, a0
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v9, 4, v0.t
+; RV64ZVE32F-NEXT:    vslide1down.vx v9, v9, a7
+; RV64ZVE32F-NEXT:    vslide1down.vx v10, v8, a2
+; RV64ZVE32F-NEXT:    vslide1down.vx v8, v9, a0
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v10, 4, v0.t
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 0, i64 2, i64 3, i64 1, i64 4, i64 5, i64 6, i64 7>
   %v = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> splat (i1 true), <8 x i16> poison)
@@ -15152,258 +15152,258 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa s0, 0
 ; RV32ZVE32F-NEXT:    andi sp, sp, -128
 ; RV32ZVE32F-NEXT:    li a2, 32
+; RV32ZVE32F-NEXT:    lw a3, 0(a1)
+; RV32ZVE32F-NEXT:    sw a3, 236(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a3, 4(a1)
+; RV32ZVE32F-NEXT:    sw a3, 232(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    addi a3, sp, 256
 ; RV32ZVE32F-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32ZVE32F-NEXT:    vid.v v8
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 4
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 252(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 248(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 1
+; RV32ZVE32F-NEXT:    vslidedown.vi v17, v8, 2
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 244(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 240(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 2
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 220(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 216(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 3
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 212(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 208(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s a4, v17
+; RV32ZVE32F-NEXT:    vmv.x.s a5, v16
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 4
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 236(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 232(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s a6, v16
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 5
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 228(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 224(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s a7, v16
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 6
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 204(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 200(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s t0, v16
 ; RV32ZVE32F-NEXT:    vslidedown.vi v16, v8, 7
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v16
-; RV32ZVE32F-NEXT:    lw a3, 0(a1)
-; RV32ZVE32F-NEXT:    sw a3, 196(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    vmv.x.s t1, v16
+; RV32ZVE32F-NEXT:    lw t2, 0(a1)
+; RV32ZVE32F-NEXT:    sw t2, 196(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a1)
 ; RV32ZVE32F-NEXT:    sw a1, 192(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    addi a1, sp, 256
+; RV32ZVE32F-NEXT:    lw ra, 0(a4)
+; RV32ZVE32F-NEXT:    lw a1, 4(a4)
+; RV32ZVE32F-NEXT:    sw a1, 172(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a5)
+; RV32ZVE32F-NEXT:    sw a1, 168(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a5)
+; RV32ZVE32F-NEXT:    sw a1, 164(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a6)
+; RV32ZVE32F-NEXT:    sw a1, 252(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a6)
+; RV32ZVE32F-NEXT:    sw a1, 248(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a7)
+; RV32ZVE32F-NEXT:    sw a1, 244(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a7)
+; RV32ZVE32F-NEXT:    sw a1, 240(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(t0)
+; RV32ZVE32F-NEXT:    sw a1, 188(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(t0)
+; RV32ZVE32F-NEXT:    sw a1, 184(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(t1)
+; RV32ZVE32F-NEXT:    sw a1, 180(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(t1)
+; RV32ZVE32F-NEXT:    sw a1, 176(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32ZVE32F-NEXT:    vse32.v v8, (a1)
+; RV32ZVE32F-NEXT:    vse32.v v8, (a3)
 ; RV32ZVE32F-NEXT:    lw a1, 288(sp)
 ; RV32ZVE32F-NEXT:    lw a2, 292(sp)
 ; RV32ZVE32F-NEXT:    lw a3, 296(sp)
 ; RV32ZVE32F-NEXT:    lw a4, 300(sp)
 ; RV32ZVE32F-NEXT:    lw a5, 0(a1)
-; RV32ZVE32F-NEXT:    sw a5, 188(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a5, 228(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 184(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 224(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a2)
-; RV32ZVE32F-NEXT:    sw a1, 180(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 220(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    sw a1, 176(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 216(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
-; RV32ZVE32F-NEXT:    sw a1, 172(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 212(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    sw a1, 168(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 208(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a4)
-; RV32ZVE32F-NEXT:    sw a1, 164(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 204(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a4)
-; RV32ZVE32F-NEXT:    sw a1, 160(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 200(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 304(sp)
 ; RV32ZVE32F-NEXT:    lw a2, 308(sp)
 ; RV32ZVE32F-NEXT:    lw a3, 312(sp)
 ; RV32ZVE32F-NEXT:    lw a4, 316(sp)
 ; RV32ZVE32F-NEXT:    lw a5, 0(a1)
-; RV32ZVE32F-NEXT:    sw a5, 156(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a5, 160(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a1)
-; RV32ZVE32F-NEXT:    sw a1, 152(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 156(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a2)
-; RV32ZVE32F-NEXT:    sw a1, 148(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 152(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a2)
-; RV32ZVE32F-NEXT:    sw a1, 144(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 148(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a3)
-; RV32ZVE32F-NEXT:    sw a1, 140(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 144(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a3)
-; RV32ZVE32F-NEXT:    sw a1, 136(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 140(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 0(a4)
-; RV32ZVE32F-NEXT:    sw a1, 132(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 136(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 4(a4)
-; RV32ZVE32F-NEXT:    sw a1, 128(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw a1, 132(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw a1, 320(sp)
 ; RV32ZVE32F-NEXT:    lw a2, 324(sp)
 ; RV32ZVE32F-NEXT:    lw a3, 328(sp)
 ; RV32ZVE32F-NEXT:    lw a4, 332(sp)
-; RV32ZVE32F-NEXT:    lw s8, 0(a1)
-; RV32ZVE32F-NEXT:    lw s9, 4(a1)
-; RV32ZVE32F-NEXT:    lw s10, 0(a2)
-; RV32ZVE32F-NEXT:    lw s11, 4(a2)
+; RV32ZVE32F-NEXT:    lw a5, 0(a1)
+; RV32ZVE32F-NEXT:    sw a5, 128(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
+; RV32ZVE32F-NEXT:    sw a1, 124(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    sw a1, 120(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a2)
+; RV32ZVE32F-NEXT:    sw a1, 116(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw s8, 0(a3)
+; RV32ZVE32F-NEXT:    lw s9, 4(a3)
+; RV32ZVE32F-NEXT:    lw s10, 0(a4)
+; RV32ZVE32F-NEXT:    lw s11, 4(a4)
+; RV32ZVE32F-NEXT:    lw a1, 336(sp)
+; RV32ZVE32F-NEXT:    lw a2, 340(sp)
+; RV32ZVE32F-NEXT:    lw a3, 344(sp)
+; RV32ZVE32F-NEXT:    lw a4, 348(sp)
+; RV32ZVE32F-NEXT:    lw t5, 0(a1)
+; RV32ZVE32F-NEXT:    lw t6, 4(a1)
+; RV32ZVE32F-NEXT:    lw s2, 0(a2)
+; RV32ZVE32F-NEXT:    lw s3, 4(a2)
+; RV32ZVE32F-NEXT:    lw a5, 0(a3)
+; RV32ZVE32F-NEXT:    lw a6, 4(a3)
+; RV32ZVE32F-NEXT:    lw a7, 0(a4)
+; RV32ZVE32F-NEXT:    lw t0, 4(a4)
+; RV32ZVE32F-NEXT:    lw a1, 352(sp)
+; RV32ZVE32F-NEXT:    lw a2, 356(sp)
+; RV32ZVE32F-NEXT:    lw a3, 360(sp)
+; RV32ZVE32F-NEXT:    lw a4, 364(sp)
+; RV32ZVE32F-NEXT:    lw t1, 0(a1)
+; RV32ZVE32F-NEXT:    sw t1, 112(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a1)
+; RV32ZVE32F-NEXT:    sw a1, 108(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 0(a2)
+; RV32ZVE32F-NEXT:    sw a1, 104(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    lw a1, 4(a2)
+; RV32ZVE32F-NEXT:    sw a1, 100(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    lw s4, 0(a3)
 ; RV32ZVE32F-NEXT:    lw s5, 4(a3)
 ; RV32ZVE32F-NEXT:    lw s6, 0(a4)
 ; RV32ZVE32F-NEXT:    lw s7, 4(a4)
-; RV32ZVE32F-NEXT:    lw a2, 336(sp)
-; RV32ZVE32F-NEXT:    lw a4, 340(sp)
-; RV32ZVE32F-NEXT:    lw a5, 344(sp)
-; RV32ZVE32F-NEXT:    lw a6, 348(sp)
-; RV32ZVE32F-NEXT:    lw a7, 0(a2)
-; RV32ZVE32F-NEXT:    lw t0, 4(a2)
-; RV32ZVE32F-NEXT:    lw t1, 0(a4)
-; RV32ZVE32F-NEXT:    lw t2, 4(a4)
-; RV32ZVE32F-NEXT:    lw a1, 0(a5)
-; RV32ZVE32F-NEXT:    lw a2, 4(a5)
-; RV32ZVE32F-NEXT:    lw a3, 0(a6)
-; RV32ZVE32F-NEXT:    lw a4, 4(a6)
-; RV32ZVE32F-NEXT:    lw a5, 352(sp)
-; RV32ZVE32F-NEXT:    lw a6, 356(sp)
-; RV32ZVE32F-NEXT:    lw t3, 360(sp)
-; RV32ZVE32F-NEXT:    lw t4, 364(sp)
-; RV32ZVE32F-NEXT:    lw t5, 0(a5)
-; RV32ZVE32F-NEXT:    sw t5, 116(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 4(a5)
-; RV32ZVE32F-NEXT:    sw a5, 112(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 0(a6)
-; RV32ZVE32F-NEXT:    sw a5, 124(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 4(a6)
-; RV32ZVE32F-NEXT:    sw a5, 120(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw ra, 0(t3)
-; RV32ZVE32F-NEXT:    lw a5, 4(t3)
-; RV32ZVE32F-NEXT:    sw a5, 108(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 0(t4)
-; RV32ZVE32F-NEXT:    sw a5, 104(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 4(t4)
-; RV32ZVE32F-NEXT:    sw a5, 100(sp) # 4-byte Folded Spill
-; RV32ZVE32F-NEXT:    lw a5, 368(sp)
-; RV32ZVE32F-NEXT:    lw a6, 372(sp)
-; RV32ZVE32F-NEXT:    lw t3, 376(sp)
-; RV32ZVE32F-NEXT:    lw t4, 380(sp)
-; RV32ZVE32F-NEXT:    lw t5, 0(a5)
-; RV32ZVE32F-NEXT:    lw t6, 4(a5)
-; RV32ZVE32F-NEXT:    lw s2, 0(a6)
-; RV32ZVE32F-NEXT:    lw s3, 4(a6)
-; RV32ZVE32F-NEXT:    lw a5, 0(t3)
-; RV32ZVE32F-NEXT:    lw a6, 4(t3)
-; RV32ZVE32F-NEXT:    lw t3, 0(t4)
-; RV32ZVE32F-NEXT:    lw t4, 4(t4)
-; RV32ZVE32F-NEXT:    sw a1, 176(a0)
-; RV32ZVE32F-NEXT:    sw a2, 180(a0)
-; RV32ZVE32F-NEXT:    sw a3, 184(a0)
-; RV32ZVE32F-NEXT:    sw a4, 188(a0)
-; RV32ZVE32F-NEXT:    sw a7, 160(a0)
-; RV32ZVE32F-NEXT:    sw t0, 164(a0)
-; RV32ZVE32F-NEXT:    sw t1, 168(a0)
-; RV32ZVE32F-NEXT:    sw t2, 172(a0)
-; RV32ZVE32F-NEXT:    sw s4, 144(a0)
-; RV32ZVE32F-NEXT:    sw s5, 148(a0)
-; RV32ZVE32F-NEXT:    sw s6, 152(a0)
-; RV32ZVE32F-NEXT:    sw s7, 156(a0)
-; RV32ZVE32F-NEXT:    sw s8, 128(a0)
-; RV32ZVE32F-NEXT:    sw s9, 132(a0)
-; RV32ZVE32F-NEXT:    sw s10, 136(a0)
-; RV32ZVE32F-NEXT:    sw s11, 140(a0)
-; RV32ZVE32F-NEXT:    lw a1, 140(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 112(a0)
-; RV32ZVE32F-NEXT:    lw a1, 136(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 116(a0)
-; RV32ZVE32F-NEXT:    lw a1, 132(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 120(a0)
-; RV32ZVE32F-NEXT:    lw a1, 128(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 124(a0)
-; RV32ZVE32F-NEXT:    lw a1, 156(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 96(a0)
-; RV32ZVE32F-NEXT:    lw a1, 152(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 100(a0)
-; RV32ZVE32F-NEXT:    lw a1, 148(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 104(a0)
-; RV32ZVE32F-NEXT:    lw a1, 144(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 108(a0)
-; RV32ZVE32F-NEXT:    lw a1, 172(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 80(a0)
-; RV32ZVE32F-NEXT:    lw a1, 168(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 84(a0)
-; RV32ZVE32F-NEXT:    lw a1, 164(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 88(a0)
-; RV32ZVE32F-NEXT:    lw a1, 160(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 92(a0)
-; RV32ZVE32F-NEXT:    lw a1, 188(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 64(a0)
-; RV32ZVE32F-NEXT:    lw a1, 184(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 68(a0)
-; RV32ZVE32F-NEXT:    lw a1, 180(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 72(a0)
-; RV32ZVE32F-NEXT:    lw a1, 176(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 76(a0)
-; RV32ZVE32F-NEXT:    sw a5, 240(a0)
-; RV32ZVE32F-NEXT:    sw a6, 244(a0)
-; RV32ZVE32F-NEXT:    sw t3, 248(a0)
-; RV32ZVE32F-NEXT:    sw t4, 252(a0)
-; RV32ZVE32F-NEXT:    sw t5, 224(a0)
-; RV32ZVE32F-NEXT:    sw t6, 228(a0)
-; RV32ZVE32F-NEXT:    sw s2, 232(a0)
-; RV32ZVE32F-NEXT:    sw s3, 236(a0)
-; RV32ZVE32F-NEXT:    sw ra, 208(a0)
-; RV32ZVE32F-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 212(a0)
-; RV32ZVE32F-NEXT:    lw a1, 104(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 216(a0)
-; RV32ZVE32F-NEXT:    lw a1, 100(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 220(a0)
-; RV32ZVE32F-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 192(a0)
+; RV32ZVE32F-NEXT:    lw a1, 368(sp)
+; RV32ZVE32F-NEXT:    lw a2, 372(sp)
+; RV32ZVE32F-NEXT:    lw a3, 376(sp)
+; RV32ZVE32F-NEXT:    lw a4, 380(sp)
+; RV32ZVE32F-NEXT:    lw t1, 0(a1)
+; RV32ZVE32F-NEXT:    lw t2, 4(a1)
+; RV32ZVE32F-NEXT:    lw t3, 0(a2)
+; RV32ZVE32F-NEXT:    lw t4, 4(a2)
+; RV32ZVE32F-NEXT:    lw a1, 0(a3)
+; RV32ZVE32F-NEXT:    lw a2, 4(a3)
+; RV32ZVE32F-NEXT:    lw a3, 0(a4)
+; RV32ZVE32F-NEXT:    lw a4, 4(a4)
+; RV32ZVE32F-NEXT:    sw ra, 16(a0)
+; RV32ZVE32F-NEXT:    lw ra, 172(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 20(a0)
+; RV32ZVE32F-NEXT:    lw ra, 168(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 24(a0)
+; RV32ZVE32F-NEXT:    lw ra, 164(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 28(a0)
+; RV32ZVE32F-NEXT:    lw ra, 236(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 0(a0)
+; RV32ZVE32F-NEXT:    lw ra, 232(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 4(a0)
+; RV32ZVE32F-NEXT:    lw ra, 196(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 8(a0)
+; RV32ZVE32F-NEXT:    lw ra, 192(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 12(a0)
+; RV32ZVE32F-NEXT:    lw ra, 188(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 48(a0)
+; RV32ZVE32F-NEXT:    lw ra, 184(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 52(a0)
+; RV32ZVE32F-NEXT:    lw ra, 180(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 56(a0)
+; RV32ZVE32F-NEXT:    lw ra, 176(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw ra, 60(a0)
+; RV32ZVE32F-NEXT:    sw a5, 176(a0)
+; RV32ZVE32F-NEXT:    sw a6, 180(a0)
+; RV32ZVE32F-NEXT:    sw a7, 184(a0)
+; RV32ZVE32F-NEXT:    sw t0, 188(a0)
+; RV32ZVE32F-NEXT:    sw t5, 160(a0)
+; RV32ZVE32F-NEXT:    sw t6, 164(a0)
+; RV32ZVE32F-NEXT:    sw s2, 168(a0)
+; RV32ZVE32F-NEXT:    sw s3, 172(a0)
+; RV32ZVE32F-NEXT:    sw s8, 144(a0)
+; RV32ZVE32F-NEXT:    sw s9, 148(a0)
+; RV32ZVE32F-NEXT:    sw s10, 152(a0)
+; RV32ZVE32F-NEXT:    sw s11, 156(a0)
+; RV32ZVE32F-NEXT:    lw a5, 128(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 128(a0)
+; RV32ZVE32F-NEXT:    lw a5, 124(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 132(a0)
+; RV32ZVE32F-NEXT:    lw a5, 120(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 136(a0)
+; RV32ZVE32F-NEXT:    lw a5, 116(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 140(a0)
+; RV32ZVE32F-NEXT:    lw a5, 144(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 112(a0)
+; RV32ZVE32F-NEXT:    lw a5, 140(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 116(a0)
+; RV32ZVE32F-NEXT:    lw a5, 136(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 120(a0)
+; RV32ZVE32F-NEXT:    lw a5, 132(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 124(a0)
+; RV32ZVE32F-NEXT:    lw a5, 160(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 96(a0)
+; RV32ZVE32F-NEXT:    lw a5, 156(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 100(a0)
+; RV32ZVE32F-NEXT:    lw a5, 152(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 104(a0)
+; RV32ZVE32F-NEXT:    lw a5, 148(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 108(a0)
+; RV32ZVE32F-NEXT:    lw a5, 212(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 80(a0)
+; RV32ZVE32F-NEXT:    lw a5, 208(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 84(a0)
+; RV32ZVE32F-NEXT:    lw a5, 204(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 88(a0)
+; RV32ZVE32F-NEXT:    lw a5, 200(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 92(a0)
+; RV32ZVE32F-NEXT:    lw a5, 228(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 64(a0)
+; RV32ZVE32F-NEXT:    lw a5, 224(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 68(a0)
+; RV32ZVE32F-NEXT:    lw a5, 220(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 72(a0)
+; RV32ZVE32F-NEXT:    lw a5, 216(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a5, 76(a0)
+; RV32ZVE32F-NEXT:    sw a1, 240(a0)
+; RV32ZVE32F-NEXT:    sw a2, 244(a0)
+; RV32ZVE32F-NEXT:    sw a3, 248(a0)
+; RV32ZVE32F-NEXT:    sw a4, 252(a0)
+; RV32ZVE32F-NEXT:    sw t1, 224(a0)
+; RV32ZVE32F-NEXT:    sw t2, 228(a0)
+; RV32ZVE32F-NEXT:    sw t3, 232(a0)
+; RV32ZVE32F-NEXT:    sw t4, 236(a0)
+; RV32ZVE32F-NEXT:    sw s4, 208(a0)
+; RV32ZVE32F-NEXT:    sw s5, 212(a0)
+; RV32ZVE32F-NEXT:    sw s6, 216(a0)
+; RV32ZVE32F-NEXT:    sw s7, 220(a0)
 ; RV32ZVE32F-NEXT:    lw a1, 112(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    sw a1, 192(a0)
+; RV32ZVE32F-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 196(a0)
-; RV32ZVE32F-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 104(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 200(a0)
-; RV32ZVE32F-NEXT:    lw a1, 120(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 100(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 204(a0)
-; RV32ZVE32F-NEXT:    lw a1, 220(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 16(a0)
-; RV32ZVE32F-NEXT:    lw a1, 216(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 20(a0)
-; RV32ZVE32F-NEXT:    lw a1, 212(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 24(a0)
-; RV32ZVE32F-NEXT:    lw a1, 208(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 28(a0)
 ; RV32ZVE32F-NEXT:    lw a1, 252(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 0(a0)
-; RV32ZVE32F-NEXT:    lw a1, 248(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 4(a0)
-; RV32ZVE32F-NEXT:    lw a1, 244(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 8(a0)
-; RV32ZVE32F-NEXT:    lw a1, 240(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 12(a0)
-; RV32ZVE32F-NEXT:    lw a1, 204(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 48(a0)
-; RV32ZVE32F-NEXT:    lw a1, 200(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 52(a0)
-; RV32ZVE32F-NEXT:    lw a1, 196(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 56(a0)
-; RV32ZVE32F-NEXT:    lw a1, 192(sp) # 4-byte Folded Reload
-; RV32ZVE32F-NEXT:    sw a1, 60(a0)
-; RV32ZVE32F-NEXT:    lw a1, 236(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 32(a0)
-; RV32ZVE32F-NEXT:    lw a1, 232(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 248(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 36(a0)
-; RV32ZVE32F-NEXT:    lw a1, 228(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 244(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 40(a0)
-; RV32ZVE32F-NEXT:    lw a1, 224(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw a1, 240(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    sw a1, 44(a0)
 ; RV32ZVE32F-NEXT:    addi sp, s0, -512
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa sp, 512

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
index dbbec96445e3ea..f72b08a405246e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
@@ -278,12 +278,12 @@ define <64 x float> @masked_load_v64f32(ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v64f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 4
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <64 x float> @llvm.masked.load.v64f32(ptr %a, i32 8, <64 x i1> %mask, <64 x float> undef)
@@ -294,12 +294,12 @@ define <128 x bfloat> @masked_load_v128bf16(ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v128bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <128 x bfloat> @llvm.masked.load.v128bf16(ptr %a, i32 8, <128 x i1> %mask, <128 x bfloat> undef)
@@ -310,12 +310,12 @@ define <128 x half> @masked_load_v128f16(ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v128f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <128 x half> @llvm.masked.load.v128f16(ptr %a, i32 8, <128 x i1> %mask, <128 x half> undef)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
index 4f3313f3760bee..e0cf39c75da240 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
@@ -276,12 +276,12 @@ define <64 x i32> @masked_load_v64i32(ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v64i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 4
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <64 x i32> @llvm.masked.load.v64i32(ptr %a, i32 8, <64 x i1> %mask, <64 x i32> undef)
@@ -303,12 +303,12 @@ define <128 x i16> @masked_load_v128i16(ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_load_v128i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vle16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   %load = call <128 x i16> @llvm.masked.load.v128i16(ptr %a, i32 8, <128 x i1> %mask, <128 x i16> undef)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 476d023b9ad6ff..575a757149ebba 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -349,12 +349,12 @@ define void @mscatter_truemask_v4i8(<4 x i8> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT:    vse8.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse8.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vse8.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vse8.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-NEXT:    vse8.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vse8.v v8, (a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> splat (i1 1))
@@ -867,12 +867,12 @@ define void @mscatter_truemask_v4i16(<4 x i16> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vse16.v v8, (a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %val, <4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1))
@@ -1744,12 +1744,12 @@ define void @mscatter_truemask_v4i32(<4 x i32> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64ZVE32F-NEXT:    vse32.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse32.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vse32.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vse32.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-NEXT:    vse32.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vse32.v v8, (a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1))
@@ -3097,20 +3097,20 @@ define void @mscatter_truemask_v4i64(<4 x i64> %val, <4 x ptr> %ptrs) {
 ; RV32ZVE32F-NEXT:    lw a0, 12(a0)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s t0, v8
-; RV32ZVE32F-NEXT:    sw a5, 0(t0)
-; RV32ZVE32F-NEXT:    sw a6, 4(t0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
-; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
-; RV32ZVE32F-NEXT:    sw a7, 0(a5)
-; RV32ZVE32F-NEXT:    sw a0, 4(a5)
+; RV32ZVE32F-NEXT:    vmv.x.s t1, v9
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
-; RV32ZVE32F-NEXT:    sw a1, 0(a0)
-; RV32ZVE32F-NEXT:    sw a2, 4(a0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    sw a3, 0(a0)
-; RV32ZVE32F-NEXT:    sw a4, 4(a0)
+; RV32ZVE32F-NEXT:    sw a5, 0(t0)
+; RV32ZVE32F-NEXT:    sw a6, 4(t0)
+; RV32ZVE32F-NEXT:    vmv.x.s a5, v9
+; RV32ZVE32F-NEXT:    vmv.x.s a6, v8
+; RV32ZVE32F-NEXT:    sw a7, 0(t1)
+; RV32ZVE32F-NEXT:    sw a0, 4(t1)
+; RV32ZVE32F-NEXT:    sw a1, 0(a5)
+; RV32ZVE32F-NEXT:    sw a2, 4(a5)
+; RV32ZVE32F-NEXT:    sw a3, 0(a6)
+; RV32ZVE32F-NEXT:    sw a4, 4(a6)
 ; RV32ZVE32F-NEXT:    ret
 ;
 ; RV64ZVE32F-LABEL: mscatter_truemask_v4i64:
@@ -5693,6 +5693,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; RV32ZVE32F-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
 ; RV32ZVE32F-NEXT:    .cfi_offset s0, -4
 ; RV32ZVE32F-NEXT:    .cfi_offset s1, -8
 ; RV32ZVE32F-NEXT:    .cfi_offset s2, -12
@@ -5702,6 +5703,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    .cfi_offset s6, -28
 ; RV32ZVE32F-NEXT:    .cfi_offset s7, -32
 ; RV32ZVE32F-NEXT:    .cfi_offset s8, -36
+; RV32ZVE32F-NEXT:    .cfi_offset s9, -40
 ; RV32ZVE32F-NEXT:    .cfi_remember_state
 ; RV32ZVE32F-NEXT:    lw a3, 56(a0)
 ; RV32ZVE32F-NEXT:    lw a4, 60(a0)
@@ -5724,21 +5726,21 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    lw s6, 0(a2)
 ; RV32ZVE32F-NEXT:    lw s7, 8(a2)
 ; RV32ZVE32F-NEXT:    lw s8, 16(a2)
-; RV32ZVE32F-NEXT:    lw a2, 24(a2)
+; RV32ZVE32F-NEXT:    lw s9, 24(a2)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v8, s6
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s7
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s8
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s9
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, s5
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a2, v0
 ; RV32ZVE32F-NEXT:    andi s2, a2, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a1
 ; RV32ZVE32F-NEXT:    bnez s2, .LBB51_10
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -5778,6 +5780,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; RV32ZVE32F-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
 ; RV32ZVE32F-NEXT:    .cfi_restore s0
 ; RV32ZVE32F-NEXT:    .cfi_restore s1
 ; RV32ZVE32F-NEXT:    .cfi_restore s2
@@ -5787,6 +5790,7 @@ define void @mscatter_baseidx_v8i64(<8 x i64> %val, ptr %base, <8 x i64> %idxs,
 ; RV32ZVE32F-NEXT:    .cfi_restore s6
 ; RV32ZVE32F-NEXT:    .cfi_restore s7
 ; RV32ZVE32F-NEXT:    .cfi_restore s8
+; RV32ZVE32F-NEXT:    .cfi_restore s9
 ; RV32ZVE32F-NEXT:    addi sp, sp, 48
 ; RV32ZVE32F-NEXT:    .cfi_def_cfa_offset 0
 ; RV32ZVE32F-NEXT:    ret
@@ -6146,19 +6150,19 @@ define void @mscatter_truemask_v4bf16(<4 x bfloat> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a4, v8
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a4
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v9
+; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a1)
-; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
-; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
+; RV64ZVE32F-NEXT:    fmv.h.x fa5, a4
+; RV64ZVE32F-NEXT:    vmv.x.s a4, v8
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
-; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a3)
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
-; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
-; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
+; RV64ZVE32F-NEXT:    fmv.h.x fa5, a4
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> %val, <4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1))
@@ -6318,10 +6322,10 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB58_2
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB58_2: # %else
@@ -6331,11 +6335,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB58_4: # %else2
@@ -6358,11 +6362,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB58_9: # %else10
@@ -6377,11 +6381,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    ret
 ; RV64ZVE32F-NEXT:  .LBB58_12: # %cond.store3
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6390,11 +6394,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
@@ -6402,11 +6406,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:  .LBB58_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
@@ -6414,11 +6418,11 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:    j .LBB58_9
 ; RV64ZVE32F-NEXT:  .LBB58_15: # %cond.store11
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6426,11 +6430,13 @@ define void @mscatter_baseidx_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8>
 ; RV64ZVE32F-NEXT:  .LBB58_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-NEXT:    add a0, a0, a1
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
@@ -6467,10 +6473,10 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB59_2
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB59_2: # %else
@@ -6480,11 +6486,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB59_4: # %else2
@@ -6507,11 +6513,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB59_9: # %else10
@@ -6526,11 +6532,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    ret
 ; RV64ZVE32F-NEXT:  .LBB59_12: # %cond.store3
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6539,11 +6545,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
@@ -6551,11 +6557,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB59_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
@@ -6563,11 +6569,11 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    j .LBB59_9
 ; RV64ZVE32F-NEXT:  .LBB59_15: # %cond.store11
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6575,11 +6581,13 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB59_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-NEXT:    add a0, a0, a1
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
@@ -6615,11 +6623,11 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    beqz a2, .LBB60_2
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
-; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB60_2: # %else
@@ -6629,12 +6637,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB60_4: # %else2
@@ -6657,12 +6665,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB60_9: # %else10
@@ -6677,12 +6685,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    ret
 ; RV64ZVE32F-NEXT:  .LBB60_12: # %cond.store3
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6691,12 +6699,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
@@ -6704,12 +6712,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB60_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
@@ -6717,12 +6725,12 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:    j .LBB60_9
 ; RV64ZVE32F-NEXT:  .LBB60_15: # %cond.store11
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6730,12 +6738,14 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8
 ; RV64ZVE32F-NEXT:  .LBB60_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    andi a1, a1, 255
 ; RV64ZVE32F-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-NEXT:    add a0, a0, a1
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
@@ -6772,9 +6782,9 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    add a2, a0, a2
-; RV64ZVE32F-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB61_2: # %else
@@ -6784,11 +6794,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB61_4: # %else2
@@ -6811,11 +6821,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:  .LBB61_9: # %else10
@@ -6830,11 +6840,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    ret
 ; RV64ZVE32F-NEXT:  .LBB61_12: # %cond.store3
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 8
@@ -6843,11 +6853,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 16
@@ -6855,10 +6865,10 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:  .LBB61_14: # %cond.store7
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a2, a1, 32
@@ -6866,11 +6876,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:    j .LBB61_9
 ; RV64ZVE32F-NEXT:  .LBB61_15: # %cond.store11
 ; RV64ZVE32F-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-NEXT:    andi a1, a1, -128
@@ -6878,11 +6888,11 @@ define void @mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %id
 ; RV64ZVE32F-NEXT:  .LBB61_16: # %cond.store13
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-NEXT:    add a0, a0, a1
-; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
 ; RV64ZVE32F-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-NEXT:    fsh fa5, 0(a0)
@@ -7146,12 +7156,12 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-ZVFH-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-ZVFH-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-ZVFH-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-ZVFH-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-ZVFH-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-ZVFH-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-ZVFH-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-ZVFH-NEXT:    vse16.v v8, (a0)
 ; RV64ZVE32F-ZVFH-NEXT:    ret
 ;
@@ -7163,19 +7173,19 @@ define void @mscatter_truemask_v4f16(<4 x half> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a4, v8
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a4
+; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a4, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a1)
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
+; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a4
+; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a4, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a3)
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 3
-; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
-; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
+; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a4
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
   call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> %val, <4 x ptr> %ptrs, i32 2, <4 x i1> splat (i1 1))
@@ -7529,10 +7539,10 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB68_2
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_2: # %else
@@ -7542,11 +7552,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_4: # %else2
@@ -7569,11 +7579,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_9: # %else10
@@ -7588,11 +7598,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_12: # %cond.store3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -7601,11 +7611,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 16
@@ -7613,11 +7623,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_14: # %cond.store7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 32
@@ -7625,11 +7635,11 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    j .LBB68_9
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_15: # %cond.store11
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -7637,11 +7647,13 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB68_16: # %cond.store13
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a0, a0, a1
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
@@ -7788,10 +7800,10 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB69_2
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_2: # %else
@@ -7801,11 +7813,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_4: # %else2
@@ -7828,11 +7840,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_9: # %else10
@@ -7847,11 +7859,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_12: # %cond.store3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -7860,11 +7872,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 16
@@ -7872,11 +7884,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_14: # %cond.store7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 32
@@ -7884,11 +7896,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    j .LBB69_9
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_15: # %cond.store11
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -7896,11 +7908,13 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB69_16: # %cond.store13
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a0, a0, a1
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
@@ -8054,11 +8068,11 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    beqz a2, .LBB70_2
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_2: # %else
@@ -8068,12 +8082,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_4: # %else2
@@ -8096,12 +8110,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_9: # %else10
@@ -8116,12 +8130,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_12: # %cond.store3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -8130,12 +8144,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 16
@@ -8143,12 +8157,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_14: # %cond.store7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 32
@@ -8156,12 +8170,12 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:    j .LBB70_9
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_15: # %cond.store11
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a2, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -8169,12 +8183,14 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB70_16: # %cond.store13
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e8, mf4, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, 255
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a0, a0, a1
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
@@ -8320,9 +8336,9 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:  # %bb.1: # %cond.store
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
-; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_2: # %else
@@ -8332,11 +8348,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_4: # %else2
@@ -8359,11 +8375,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v10, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_9: # %else10
@@ -8378,11 +8394,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    ret
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_12: # %cond.store3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v11, v8, 2
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v11
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 8
@@ -8391,11 +8407,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 16
@@ -8403,10 +8419,10 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_14: # %cond.store7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v10
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v8, 4
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v9
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a2, a1, 32
@@ -8414,11 +8430,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:    j .LBB71_9
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_15: # %cond.store11
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a2, v9
-; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
-; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v10, v8, 6
+; RV64ZVE32F-ZVFHMIN-NEXT:    slli a2, a2, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a3, v10
+; RV64ZVE32F-ZVFHMIN-NEXT:    add a2, a0, a2
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a3
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a2)
 ; RV64ZVE32F-ZVFHMIN-NEXT:    andi a1, a1, -128
@@ -8426,11 +8442,11 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
 ; RV64ZVE32F-ZVFHMIN-NEXT:  .LBB71_16: # %cond.store13
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v9, v9, 1
+; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v9
 ; RV64ZVE32F-ZVFHMIN-NEXT:    slli a1, a1, 1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    add a0, a0, a1
-; RV64ZVE32F-ZVFHMIN-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-ZVFHMIN-NEXT:    vslidedown.vi v8, v8, 7
 ; RV64ZVE32F-ZVFHMIN-NEXT:    vmv.x.s a1, v8
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fmv.h.x fa5, a1
 ; RV64ZVE32F-ZVFHMIN-NEXT:    fsh fa5, 0(a0)
@@ -8603,12 +8619,12 @@ define void @mscatter_truemask_v4f32(<4 x float> %val, <4 x ptr> %ptrs) {
 ; RV64ZVE32F-NEXT:    ld a3, 16(a0)
 ; RV64ZVE32F-NEXT:    ld a0, 24(a0)
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; RV64ZVE32F-NEXT:    vse32.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse32.v v8, (a1)
 ; RV64ZVE32F-NEXT:    vse32.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
-; RV64ZVE32F-NEXT:    vse32.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV64ZVE32F-NEXT:    vse32.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vse32.v v8, (a0)
 ; RV64ZVE32F-NEXT:    ret
   call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %val, <4 x ptr> %ptrs, i32 4, <4 x i1> splat (i1 1))
@@ -9925,14 +9941,14 @@ define void @mscatter_truemask_v4f64(<4 x double> %val, <4 x ptr> %ptrs) {
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
-; RV32ZVE32F-NEXT:    fsd fa0, 0(a0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV32ZVE32F-NEXT:    fsd fa0, 0(a0)
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
-; RV32ZVE32F-NEXT:    fsd fa1, 0(a0)
 ; RV32ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
+; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
+; RV32ZVE32F-NEXT:    fsd fa1, 0(a0)
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v9
 ; RV32ZVE32F-NEXT:    fsd fa2, 0(a0)
-; RV32ZVE32F-NEXT:    vslidedown.vi v8, v8, 3
 ; RV32ZVE32F-NEXT:    vmv.x.s a0, v8
 ; RV32ZVE32F-NEXT:    fsd fa3, 0(a0)
 ; RV32ZVE32F-NEXT:    ret
@@ -10153,11 +10169,11 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8>
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB91_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -10353,11 +10369,11 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB92_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -10555,11 +10571,11 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf4 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB93_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -10764,11 +10780,11 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB94_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -10965,11 +10981,11 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vsext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB95_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -11168,11 +11184,11 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
 ; RV32ZVE32F:       # %bb.0:
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vzext.vf2 v10, v8
-; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
-; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; RV32ZVE32F-NEXT:    vsll.vi v8, v10, 3
+; RV32ZVE32F-NEXT:    andi a2, a1, 1
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB96_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -11991,21 +12007,21 @@ define void @mscatter_baseidx_v8f64(<8 x double> %val, ptr %base, <8 x i64> %idx
 ; RV32ZVE32F-NEXT:    lw a6, 0(a1)
 ; RV32ZVE32F-NEXT:    lw a7, 8(a1)
 ; RV32ZVE32F-NEXT:    lw t0, 16(a1)
-; RV32ZVE32F-NEXT:    lw a1, 24(a1)
+; RV32ZVE32F-NEXT:    lw t1, 24(a1)
 ; RV32ZVE32F-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vmv.v.x v8, a6
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
+; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
+; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a7
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t0
-; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a1
+; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, t1
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a2
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a3
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a4
 ; RV32ZVE32F-NEXT:    vslide1down.vx v8, v8, a5
 ; RV32ZVE32F-NEXT:    vsll.vi v8, v8, 3
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
-; RV32ZVE32F-NEXT:    vmv.x.s a1, v0
 ; RV32ZVE32F-NEXT:    andi a2, a1, 1
-; RV32ZVE32F-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32ZVE32F-NEXT:    vadd.vx v8, v8, a0
 ; RV32ZVE32F-NEXT:    bnez a2, .LBB100_9
 ; RV32ZVE32F-NEXT:  # %bb.1: # %else
@@ -12902,8 +12918,8 @@ define void @mscatter_shuffle_rotate(<8 x i16> %val, ptr %base) {
 ; RV64ZVE32F-NEXT:    addi a6, a0, 10
 ; RV64ZVE32F-NEXT:    addi a7, a0, 8
 ; RV64ZVE32F-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT:    vse16.v v8, (a7)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT:    vse16.v v8, (a7)
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a6)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 2
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a5)
@@ -12914,8 +12930,8 @@ define void @mscatter_shuffle_rotate(<8 x i16> %val, ptr %base) {
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 5
 ; RV64ZVE32F-NEXT:    vse16.v v9, (a3)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v9, v8, 6
-; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vslidedown.vi v8, v8, 7
+; RV64ZVE32F-NEXT:    vse16.v v9, (a2)
 ; RV64ZVE32F-NEXT:    vse16.v v8, (a1)
 ; RV64ZVE32F-NEXT:    ret
   %ptrs = getelementptr inbounds i16, ptr %base, <8 x i64>  <i64 4, i64 5, i64 6, i64 7, i64 0, i64 1, i64 2, i64 3>

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
index f7e311d06c03a1..ed6ec4d5659b15 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll
@@ -278,12 +278,12 @@ define void @masked_store_v64f32(<64 x float> %val, ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v64f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 4
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v64f32.p0(<64 x float> %val, ptr %a, i32 8, <64 x i1> %mask)
@@ -294,12 +294,12 @@ define void @masked_store_v128bf16(<128 x bfloat> %val, ptr %a, <128 x i1> %mask
 ; CHECK-LABEL: masked_store_v128bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v128bf16.p0(<128 x bfloat> %val, ptr %a, i32 8, <128 x i1> %mask)
@@ -310,12 +310,12 @@ define void @masked_store_v128f16(<128 x half> %val, ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v128f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v128f16.p0(<128 x half> %val, ptr %a, i32 8, <128 x i1> %mask)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
index 0c9bf9a09fd6d3..c3b10db115bae5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll
@@ -276,12 +276,12 @@ define void @masked_store_v64i32(<64 x i32> %val, ptr %a, <64 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v64i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 4
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse32.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v64i32.p0(<64 x i32> %val, ptr %a, i32 8, <64 x i1> %mask)
@@ -303,12 +303,12 @@ define void @masked_store_v128i16(<128 x i16> %val, ptr %a, <128 x i1> %mask) {
 ; CHECK-LABEL: masked_store_v128i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 8
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vse16.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v128i16.p0(<128 x i16> %val, ptr %a, i32 8, <128 x i1> %mask)
@@ -321,10 +321,10 @@ define void @masked_store_v256i8(<256 x i8> %val, ptr %a, <256 x i1> %mask) {
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v24, (a1)
+; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vse8.v v8, (a0), v0.t
-; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vse8.v v16, (a0), v0.t
+; CHECK-NEXT:    vse8.v v16, (a1), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.masked.store.v256i8.p0(<256 x i8> %val, ptr %a, i32 8, <256 x i1> %mask)
   ret void

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index 3fab9ce6367868..46c2033d28b387 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -135,9 +135,9 @@ declare <16 x half> @llvm.vp.nearbyint.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v16f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI6_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -393,9 +393,9 @@ declare <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -437,9 +437,9 @@ declare <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -481,9 +481,9 @@ declare <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_nearbyint_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v15f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -525,9 +525,9 @@ declare <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_v16f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -578,9 +578,15 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
-; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -589,14 +595,18 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    fsflags a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
@@ -610,6 +620,12 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    fsflags a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
@@ -624,30 +640,33 @@ define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    frflags a2
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    frflags a1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
-; CHECK-NEXT:    fsflags a1
+; CHECK-NEXT:    fsflags a2
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    frflags a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    fsflags a0
+; CHECK-NEXT:    fsflags a1
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl)
   ret <32 x double> %v

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
index a8798474d669ae..4f0f5dd78c94b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll
@@ -437,8 +437,8 @@ define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
 ; RV32-NEXT:    vslidedown.vi v9, v8, 1
 ; RV32-NEXT:    vmv.x.s a1, v9
 ; RV32-NEXT:    vslidedown.vi v9, v8, 2
-; RV32-NEXT:    vmv.x.s a2, v9
 ; RV32-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-NEXT:    vmv.x.s a2, v9
 ; RV32-NEXT:    vmv.x.s a3, v8
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    add a0, a0, a3
@@ -452,8 +452,8 @@ define i32 @reduce_sum_4xi32_reduce_order(<4 x i32> %v) {
 ; RV64-NEXT:    vslidedown.vi v9, v8, 1
 ; RV64-NEXT:    vmv.x.s a1, v9
 ; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    vmv.x.s a2, v9
 ; RV64-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-NEXT:    vmv.x.s a2, v9
 ; RV64-NEXT:    vmv.x.s a3, v8
 ; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    add a0, a0, a3
@@ -856,13 +856,13 @@ define float @reduce_fadd_4xi32_non_associative(ptr %p) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
-; CHECK-NEXT:    vfmv.f.s fa5, v9
 ; CHECK-NEXT:    lui a0, 524288
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetivli zero, 3, e32, m1, ta, ma
-; CHECK-NEXT:    vfredusum.vs v8, v8, v9
-; CHECK-NEXT:    vfmv.f.s fa4, v8
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vslidedown.vi v8, v8, 3
+; CHECK-NEXT:    vfmv.f.s fa5, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
 ; CHECK-NEXT:    fadd.s fa0, fa4, fa5
 ; CHECK-NEXT:    ret
   %v = load <4 x float>, ptr %p, align 256
@@ -887,8 +887,8 @@ define float @reduce_fadd_4xi32_non_associative2(ptr %p) {
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 1
 ; CHECK-NEXT:    vfmv.f.s fa4, v9
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vfmv.f.s fa3, v9
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 3
+; CHECK-NEXT:    vfmv.f.s fa3, v9
 ; CHECK-NEXT:    vfmv.f.s fa2, v8
 ; CHECK-NEXT:    fadd.s fa5, fa5, fa4
 ; CHECK-NEXT:    fadd.s fa4, fa3, fa2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 487234674befe0..8bf30f8f0d072b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -797,13 +797,13 @@ define float @vreduce_ord_fwadd_v64f32(ptr %x, float %s) {
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v8, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfmv.s.f v12, fa0
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT:    vslidedown.vx v24, v8, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfwredosum.vs v8, v8, v12
-; CHECK-NEXT:    vfwredosum.vs v8, v16, v8
+; CHECK-NEXT:    vfwredosum.vs v8, v8, v16
+; CHECK-NEXT:    vfwredosum.vs v8, v24, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -1194,13 +1194,13 @@ define double @vreduce_ord_fwadd_v32f64(ptr %x, double %s) {
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v8, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m1, ta, ma
-; CHECK-NEXT:    vfmv.s.f v12, fa0
+; CHECK-NEXT:    vfmv.s.f v16, fa0
+; CHECK-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v8, 16
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vfwredosum.vs v8, v8, v12
-; CHECK-NEXT:    vfwredosum.vs v8, v16, v8
+; CHECK-NEXT:    vfwredosum.vs v8, v8, v16
+; CHECK-NEXT:    vfwredosum.vs v8, v24, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -1364,17 +1364,17 @@ define float @vreduce_fmin_v128f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmin_v128f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmin.vv v16, v24, v16
-; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfmin.vv v8, v24, v8
+; CHECK-NEXT:    vfmin.vv v16, v16, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -1611,17 +1611,17 @@ define float @vreduce_fmax_v128f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmax_v128f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmax.vv v16, v24, v16
-; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfmax.vv v8, v24, v8
+; CHECK-NEXT:    vfmax.vv v16, v16, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -2013,80 +2013,61 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmin.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmin.vv v8, v8, v16
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
@@ -2101,10 +2082,7 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:  .LBB121_3:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -2119,17 +2097,17 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v128f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmin.vv v16, v24, v16
-; CHECK-NEXT:    vfmin.vv v8, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfmin.vv v8, v24, v8
+; CHECK-NEXT:    vfmin.vv v16, v16, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -2353,79 +2331,60 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmin.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmin.vv v8, v8, v16
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
@@ -2440,10 +2399,7 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:  .LBB133_3:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -2459,9 +2415,9 @@ define double @vreduce_fminimum_v64f64_nonans(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    addi a2, a0, 384
-; CHECK-NEXT:    vle64.v v16, (a2)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    vle64.v v0, (a1)
@@ -2771,80 +2727,61 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v16, (a2)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmax.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v8, v8, v16
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
@@ -2859,10 +2796,7 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:  .LBB149_3:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -2877,17 +2811,17 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v128f32_nonans:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    addi a2, a0, 384
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 384
-; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    vle32.v v0, (a1)
-; CHECK-NEXT:    vfmax.vv v16, v24, v16
-; CHECK-NEXT:    vfmax.vv v8, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfmax.vv v8, v24, v8
+; CHECK-NEXT:    vfmax.vv v16, v16, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:    ret
@@ -3111,79 +3045,60 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vfmax.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v24, v24, v8
-; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v7, v24, v24
+; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v8, v8, v16
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
@@ -3198,10 +3113,7 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-NEXT:  .LBB161_3:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -3217,9 +3129,9 @@ define double @vreduce_fmaximum_v64f64_nonans(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    addi a2, a0, 384
-; CHECK-NEXT:    vle64.v v16, (a2)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    vle64.v v0, (a1)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 0a3c4874c5e8ba..f920e39e7d295c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -811,9 +811,9 @@ define signext i32 @vpreduce_xor_v64i32(i32 signext %s, <64 x i32> %v, <64 x i1>
 ; CHECK-NEXT:  .LBB49_2:
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v25, a0
+; CHECK-NEXT:    addi a0, a1, -32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vredxor.vs v25, v8, v25, v0.t
-; CHECK-NEXT:    addi a0, a1, -32
 ; CHECK-NEXT:    sltu a1, a1, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
@@ -838,10 +838,10 @@ define signext i64 @vpreduce_add_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredsum.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -873,10 +873,10 @@ define signext i64 @vpreduce_umax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -908,10 +908,10 @@ define signext i64 @vpreduce_smax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredmax.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -943,10 +943,10 @@ define signext i64 @vpreduce_umin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -978,10 +978,10 @@ define signext i64 @vpreduce_smin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredmin.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1013,10 +1013,10 @@ define signext i64 @vpreduce_and_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredand.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1048,10 +1048,10 @@ define signext i64 @vpreduce_or_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredor.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1083,10 +1083,10 @@ define signext i64 @vpreduce_xor_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredxor.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1118,10 +1118,10 @@ define signext i64 @vpreduce_add_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredsum.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1153,10 +1153,10 @@ define signext i64 @vpreduce_umax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1188,10 +1188,10 @@ define signext i64 @vpreduce_smax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredmax.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1223,10 +1223,10 @@ define signext i64 @vpreduce_umin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredminu.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1258,10 +1258,10 @@ define signext i64 @vpreduce_smin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredmin.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1293,10 +1293,10 @@ define signext i64 @vpreduce_and_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredand.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1328,10 +1328,10 @@ define signext i64 @vpreduce_or_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredor.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1363,10 +1363,10 @@ define signext i64 @vpreduce_xor_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredxor.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1789,24 +1789,24 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    li a3, 32
-; RV32-NEXT:    lui a2, %hi(.LCPI72_0)
-; RV32-NEXT:    addi a2, a2, %lo(.LCPI72_0)
-; RV32-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT:    vle8.v v12, (a2)
 ; RV32-NEXT:    mv a2, a0
+; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    lui a3, %hi(.LCPI72_0)
+; RV32-NEXT:    addi a3, a3, %lo(.LCPI72_0)
+; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT:    vle8.v v12, (a3)
 ; RV32-NEXT:    vid.v v16
 ; RV32-NEXT:    vmsltu.vx v14, v16, a1
+; RV32-NEXT:    li a3, 64
 ; RV32-NEXT:    vsext.vf4 v16, v12
 ; RV32-NEXT:    vmsltu.vx v12, v16, a1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vslideup.vi v14, v12, 4
-; RV32-NEXT:    li a0, 64
-; RV32-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV32-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
 ; RV32-NEXT:    vmand.mm v0, v14, v0
 ; RV32-NEXT:    vmv.v.i v12, 1
 ; RV32-NEXT:    vmerge.vvm v8, v12, v8, v0
-; RV32-NEXT:    vslidedown.vx v12, v8, a3
+; RV32-NEXT:    vslidedown.vx v12, v8, a0
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 16
 ; RV32-NEXT:    vmul.vv v8, v8, v12
@@ -1835,24 +1835,24 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m,
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    li a3, 32
-; RV64-NEXT:    lui a2, %hi(.LCPI72_0)
-; RV64-NEXT:    addi a2, a2, %lo(.LCPI72_0)
-; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; RV64-NEXT:    vle8.v v12, (a2)
 ; RV64-NEXT:    mv a2, a0
+; RV64-NEXT:    li a0, 32
+; RV64-NEXT:    lui a3, %hi(.LCPI72_0)
+; RV64-NEXT:    addi a3, a3, %lo(.LCPI72_0)
+; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; RV64-NEXT:    vle8.v v12, (a3)
 ; RV64-NEXT:    vid.v v16
 ; RV64-NEXT:    vmsltu.vx v14, v16, a1
+; RV64-NEXT:    li a3, 64
 ; RV64-NEXT:    vsext.vf4 v16, v12
 ; RV64-NEXT:    vmsltu.vx v12, v16, a1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vslideup.vi v14, v12, 4
-; RV64-NEXT:    li a0, 64
-; RV64-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
+; RV64-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
 ; RV64-NEXT:    vmand.mm v0, v14, v0
 ; RV64-NEXT:    vmv.v.i v12, 1
 ; RV64-NEXT:    vmerge.vvm v8, v12, v8, v0
-; RV64-NEXT:    vslidedown.vx v12, v8, a3
+; RV64-NEXT:    vslidedown.vx v12, v8, a0
 ; RV64-NEXT:    vmul.vv v8, v8, v12
 ; RV64-NEXT:    vslidedown.vi v12, v8, 16
 ; RV64-NEXT:    vmul.vv v8, v8, v12

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
index a4a104abd2ef88..2ea618bf8a2260 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll
@@ -967,8 +967,8 @@ define i64 @vwreduce_add_v1i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vsext.vf2 v9, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vsext.vf2 v9, v8
 ; RV32-NEXT:    vsrl.vx v8, v9, a0
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    vmv.x.s a0, v9
@@ -992,8 +992,8 @@ define i64 @vwreduce_uadd_v1i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    vzext.vf2 v9, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vzext.vf2 v9, v8
 ; RV32-NEXT:    vsrl.vx v8, v9, a0
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    vmv.x.s a0, v9
@@ -1020,9 +1020,9 @@ define i64 @vreduce_add_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1047,11 +1047,11 @@ define i64 @vwreduce_add_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1078,11 +1078,11 @@ define i64 @vwreduce_uadd_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1111,9 +1111,9 @@ define i64 @vreduce_add_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1138,11 +1138,11 @@ define i64 @vwreduce_add_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1169,11 +1169,11 @@ define i64 @vwreduce_uadd_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1202,9 +1202,9 @@ define i64 @vreduce_add_v8i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v12
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1229,11 +1229,11 @@ define i64 @vwreduce_add_v8i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1260,11 +1260,11 @@ define i64 @vwreduce_uadd_v8i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1293,9 +1293,9 @@ define i64 @vreduce_add_v16i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v16, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1320,11 +1320,11 @@ define i64 @vwreduce_add_v16i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1351,11 +1351,11 @@ define i64 @vwreduce_uadd_v16i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v12
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1387,9 +1387,9 @@ define i64 @vreduce_add_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v16, (a0)
 ; RV32-NEXT:    vadd.vv v8, v8, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1494,21 +1494,21 @@ declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
 define i64 @vreduce_add_v64i64(ptr %x) nounwind {
 ; RV32-LABEL: vreduce_add_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
-; RV32-NEXT:    addi a1, a0, 256
-; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vadd.vv v16, v24, v16
-; RV32-NEXT:    vadd.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 256
+; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    vadd.vv v24, v0, v24
+; RV32-NEXT:    vmv.s.x v7, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vadd.vv v8, v8, v16
-; RV32-NEXT:    vmv.s.x v16, zero
-; RV32-NEXT:    vredsum.vs v8, v8, v16
+; RV32-NEXT:    vadd.vv v8, v8, v24
+; RV32-NEXT:    vredsum.vs v8, v8, v7
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1539,51 +1539,27 @@ define i64 @vreduce_add_v64i64(ptr %x) nounwind {
 define i64 @vwreduce_add_v64i64(ptr %x) {
 ; RV32-LABEL: vwreduce_add_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vle32.v v16, (a0)
+; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v8, 16
 ; RV32-NEXT:    vslidedown.vi v0, v16, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwadd.vv v8, v24, v0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vwadd.vv v0, v8, v16
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vwadd.vv v24, v16, v8
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vwadd.vv v16, v0, v8
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vadd.vv v8, v24, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vmv.x.s a1, v8
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add sp, sp, a2
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vwreduce_add_v64i64:
@@ -1591,41 +1567,30 @@ define i64 @vwreduce_add_v64i64(ptr %x) {
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vle32.v v16, (a1)
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v8, 16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vslidedown.vi v0, v16, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vwadd.vv v8, v24, v0
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vwadd.vv v0, v8, v16
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vwadd.vv v24, v8, v16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vwadd.vv v8, v16, v0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v8, v0, v8
+; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vmv.s.x v16, zero
 ; RV64-NEXT:    vredsum.vs v8, v8, v16
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add sp, sp, a1
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -1640,51 +1605,27 @@ define i64 @vwreduce_add_v64i64(ptr %x) {
 define i64 @vwreduce_uadd_v64i64(ptr %x) {
 ; RV32-LABEL: vwreduce_uadd_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
-; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; RV32-NEXT:    vle32.v v8, (a0)
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vle32.v v16, (a1)
+; RV32-NEXT:    vle32.v v16, (a0)
+; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v8, 16
 ; RV32-NEXT:    vslidedown.vi v0, v16, 16
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vwaddu.vv v8, v24, v0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vwaddu.vv v0, v8, v16
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vwaddu.vv v24, v16, v8
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v8, 16
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; RV32-NEXT:    vwaddu.vv v16, v0, v8
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV32-NEXT:    vadd.vv v8, v0, v8
+; RV32-NEXT:    vadd.vv v8, v24, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
 ; RV32-NEXT:    vredsum.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a2
 ; RV32-NEXT:    vmv.x.s a1, v8
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 4
-; RV32-NEXT:    add sp, sp, a2
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vwreduce_uadd_v64i64:
@@ -1692,41 +1633,30 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) {
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vle32.v v16, (a1)
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v8, 16
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; RV64-NEXT:    vslidedown.vi v0, v16, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV64-NEXT:    vwaddu.vv v8, v24, v0
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vwaddu.vv v0, v8, v16
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vwaddu.vv v24, v8, v16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vwaddu.vv v8, v16, v0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; RV64-NEXT:    vadd.vv v8, v0, v8
+; RV64-NEXT:    vadd.vv v8, v24, v8
 ; RV64-NEXT:    vmv.s.x v16, zero
 ; RV64-NEXT:    vredsum.vs v8, v8, v16
 ; RV64-NEXT:    vmv.x.s a0, v8
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add sp, sp, a1
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -2162,8 +2092,8 @@ define i64 @vreduce_and_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -2189,9 +2119,9 @@ define i64 @vreduce_and_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2216,9 +2146,9 @@ define i64 @vreduce_and_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2243,9 +2173,9 @@ define i64 @vreduce_and_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2272,10 +2202,10 @@ define i64 @vreduce_and_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2307,14 +2237,14 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vand.vv v16, v24, v16
-; RV32-NEXT:    vand.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vand.vv v16, v0, v16
+; RV32-NEXT:    vand.vv v8, v8, v24
 ; RV32-NEXT:    vand.vv v8, v8, v16
 ; RV32-NEXT:    vredand.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2324,9 +2254,9 @@ define i64 @vreduce_and_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi a1, a0, 384
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
-; RV64-NEXT:    addi a2, a0, 384
-; RV64-NEXT:    vle64.v v16, (a2)
 ; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vle64.v v24, (a0)
 ; RV64-NEXT:    vle64.v v0, (a1)
@@ -2763,8 +2693,8 @@ define i64 @vreduce_or_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -2790,9 +2720,9 @@ define i64 @vreduce_or_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2817,9 +2747,9 @@ define i64 @vreduce_or_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2844,9 +2774,9 @@ define i64 @vreduce_or_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2873,10 +2803,10 @@ define i64 @vreduce_or_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2908,14 +2838,14 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vor.vv v16, v24, v16
-; RV32-NEXT:    vor.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vor.vv v16, v0, v16
+; RV32-NEXT:    vor.vv v8, v8, v24
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    vredor.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2925,9 +2855,9 @@ define i64 @vreduce_or_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi a1, a0, 384
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
-; RV64-NEXT:    addi a2, a0, 384
-; RV64-NEXT:    vle64.v v16, (a2)
 ; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vle64.v v24, (a0)
 ; RV64-NEXT:    vle64.v v0, (a1)
@@ -3386,9 +3316,9 @@ define i64 @vreduce_xor_v2i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3415,9 +3345,9 @@ define i64 @vreduce_xor_v4i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3444,9 +3374,9 @@ define i64 @vreduce_xor_v8i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v12
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3473,9 +3403,9 @@ define i64 @vreduce_xor_v16i64(ptr %x) {
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    vmv.s.x v16, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3505,9 +3435,9 @@ define i64 @vreduce_xor_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v16, (a0)
 ; RV32-NEXT:    vxor.vv v8, v8, v16
 ; RV32-NEXT:    vmv.s.x v16, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3534,21 +3464,21 @@ declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>)
 define i64 @vreduce_xor_v64i64(ptr %x) nounwind {
 ; RV32-LABEL: vreduce_xor_v64i64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a1, a0, 384
-; RV32-NEXT:    vle64.v v16, (a1)
-; RV32-NEXT:    addi a1, a0, 256
-; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vxor.vv v16, v24, v16
-; RV32-NEXT:    vxor.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 256
+; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    vxor.vv v24, v0, v24
+; RV32-NEXT:    vmv.s.x v7, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vxor.vv v8, v8, v16
-; RV32-NEXT:    vmv.s.x v16, zero
-; RV32-NEXT:    vredxor.vs v8, v8, v16
+; RV32-NEXT:    vxor.vv v8, v8, v24
+; RV32-NEXT:    vredxor.vs v8, v8, v7
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -3999,8 +3929,8 @@ define i64 @vreduce_smin_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -4026,9 +3956,9 @@ define i64 @vreduce_smin_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4053,9 +3983,9 @@ define i64 @vreduce_smin_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4080,9 +4010,9 @@ define i64 @vreduce_smin_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4109,10 +4039,10 @@ define i64 @vreduce_smin_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vmin.vv v8, v8, v16
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4144,14 +4074,14 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vmin.vv v16, v24, v16
-; RV32-NEXT:    vmin.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmin.vv v16, v0, v16
+; RV32-NEXT:    vmin.vv v8, v8, v24
 ; RV32-NEXT:    vmin.vv v8, v8, v16
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4161,9 +4091,9 @@ define i64 @vreduce_smin_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi a1, a0, 384
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
-; RV64-NEXT:    addi a2, a0, 384
-; RV64-NEXT:    vle64.v v16, (a2)
 ; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vle64.v v24, (a0)
 ; RV64-NEXT:    vle64.v v0, (a1)
@@ -4601,8 +4531,8 @@ define i64 @vreduce_smax_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -4628,9 +4558,9 @@ define i64 @vreduce_smax_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4655,9 +4585,9 @@ define i64 @vreduce_smax_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4682,9 +4612,9 @@ define i64 @vreduce_smax_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4711,10 +4641,10 @@ define i64 @vreduce_smax_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vmax.vv v8, v8, v16
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4746,14 +4676,14 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vmax.vv v16, v24, v16
-; RV32-NEXT:    vmax.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmax.vv v16, v0, v16
+; RV32-NEXT:    vmax.vv v8, v8, v24
 ; RV32-NEXT:    vmax.vv v8, v8, v16
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -4763,9 +4693,9 @@ define i64 @vreduce_smax_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi a1, a0, 384
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
-; RV64-NEXT:    addi a2, a0, 384
-; RV64-NEXT:    vle64.v v16, (a2)
 ; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vle64.v v24, (a0)
 ; RV64-NEXT:    vle64.v v0, (a1)
@@ -5203,8 +5133,8 @@ define i64 @vreduce_umin_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -5230,9 +5160,9 @@ define i64 @vreduce_umin_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5257,9 +5187,9 @@ define i64 @vreduce_umin_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5284,9 +5214,9 @@ define i64 @vreduce_umin_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5313,10 +5243,10 @@ define i64 @vreduce_umin_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vminu.vv v8, v8, v16
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5348,14 +5278,14 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vminu.vv v16, v24, v16
-; RV32-NEXT:    vminu.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vminu.vv v16, v0, v16
+; RV32-NEXT:    vminu.vv v8, v8, v24
 ; RV32-NEXT:    vminu.vv v8, v8, v16
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5365,9 +5295,9 @@ define i64 @vreduce_umin_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi a1, a0, 384
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
-; RV64-NEXT:    addi a2, a0, 384
-; RV64-NEXT:    vle64.v v16, (a2)
 ; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vle64.v v24, (a0)
 ; RV64-NEXT:    vle64.v v0, (a1)
@@ -5804,8 +5734,8 @@ define i64 @vreduce_umax_v2i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
-; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    li a0, 32
+; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-NEXT:    vmv.x.s a1, v9
@@ -5831,9 +5761,9 @@ define i64 @vreduce_umax_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5858,9 +5788,9 @@ define i64 @vreduce_umax_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5885,9 +5815,9 @@ define i64 @vreduce_umax_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5914,10 +5844,10 @@ define i64 @vreduce_umax_v32i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 128
 ; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5949,14 +5879,14 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
 ; RV32-NEXT:    vle64.v v16, (a1)
 ; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    addi a0, a0, 128
-; RV32-NEXT:    vle64.v v24, (a0)
-; RV32-NEXT:    vle64.v v0, (a1)
-; RV32-NEXT:    vmaxu.vv v16, v24, v16
-; RV32-NEXT:    vmaxu.vv v8, v8, v0
+; RV32-NEXT:    vle64.v v0, (a0)
+; RV32-NEXT:    vle64.v v24, (a1)
+; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmaxu.vv v16, v0, v16
+; RV32-NEXT:    vmaxu.vv v8, v8, v24
 ; RV32-NEXT:    vmaxu.vv v8, v8, v16
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -5966,9 +5896,9 @@ define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi a1, a0, 384
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 256
-; RV64-NEXT:    addi a2, a0, 384
-; RV64-NEXT:    vle64.v v16, (a2)
 ; RV64-NEXT:    addi a0, a0, 128
 ; RV64-NEXT:    vle64.v v24, (a0)
 ; RV64-NEXT:    vle64.v v0, (a1)
@@ -6191,8 +6121,8 @@ define i8 @vreduce_mul_v256i8(ptr %x) {
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vle8.v v16, (a0)
-; CHECK-NEXT:    vmul.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 64
+; CHECK-NEXT:    vmul.vv v8, v8, v16
 ; CHECK-NEXT:    vslidedown.vx v16, v8, a0
 ; CHECK-NEXT:    vmul.vv v8, v8, v16
 ; CHECK-NEXT:    li a0, 32
@@ -6565,9 +6495,9 @@ define i64 @vreduce_mul_v2i64(ptr %x) {
 ; RV32-NEXT:    vle64.v v8, (a0)
 ; RV32-NEXT:    addi a0, a0, 8
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vmul.vv v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -6593,12 +6523,12 @@ define i64 @vreduce_mul_v4i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vslidedown.vi v10, v8, 2
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    vrgather.vi v10, v8, 1
 ; RV32-NEXT:    vmul.vv v8, v8, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -6626,6 +6556,7 @@ define i64 @vreduce_mul_v8i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vslidedown.vi v12, v8, 4
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    vslidedown.vi v12, v8, 2
@@ -6633,7 +6564,6 @@ define i64 @vreduce_mul_v8i64(ptr %x) {
 ; RV32-NEXT:    vrgather.vi v12, v8, 1
 ; RV32-NEXT:    vmul.vv v8, v8, v12
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -6663,6 +6593,7 @@ define i64 @vreduce_mul_v16i64(ptr %x) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vslidedown.vi v16, v8, 8
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 4
@@ -6672,7 +6603,6 @@ define i64 @vreduce_mul_v16i64(ptr %x) {
 ; RV32-NEXT:    vrgather.vi v16, v8, 1
 ; RV32-NEXT:    vmul.vv v8, v8, v16
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
index dc0f4e74305550..ad358d73202402 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll
@@ -231,16 +231,18 @@ declare i1 @llvm.vp.reduce.and.v256i1(i1, <256 x i1>, <256 x i1>, i32)
 define zeroext i1 @vpreduce_and_v256i1(i1 zeroext %s, <256 x i1> %v, <256 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpreduce_and_v256i1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v11, v9
+; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    bltu a1, a3, .LBB14_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:  .LBB14_2:
+; CHECK-NEXT:    vmv1r.v v0, v11
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmnot.m v11, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vcpop.m a2, v11, v0.t
+; CHECK-NEXT:    vmnot.m v9, v9
+; CHECK-NEXT:    vcpop.m a2, v9, v0.t
 ; CHECK-NEXT:    seqz a2, a2
 ; CHECK-NEXT:    and a0, a2, a0
 ; CHECK-NEXT:    addi a2, a1, -128

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index b27492b43cfbbd..b8617fda3aa7ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -123,9 +123,9 @@ declare <16 x half> @llvm.vp.rint.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_rint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v16f16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI6_0)
 ; CHECK-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -357,9 +357,9 @@ declare <4 x double> @llvm.vp.rint.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_rint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -397,9 +397,9 @@ declare <8 x double> @llvm.vp.rint.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_rint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -437,9 +437,9 @@ declare <15 x double> @llvm.vp.rint.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_rint_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v15f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -477,9 +477,9 @@ declare <16 x double> @llvm.vp.rint.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_rint_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_v16f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -532,26 +532,27 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
@@ -583,15 +584,15 @@ define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %ev
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    addi a2, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a2
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v7, v24, fa5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index 315fe257626fd8..820a05e3d6042b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -72,10 +72,10 @@ define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -154,10 +154,10 @@ define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl)
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -238,10 +238,10 @@ define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.round.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_v16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -283,10 +283,10 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -324,10 +324,10 @@ define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.round.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_round_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.round.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_round_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -649,9 +649,9 @@ declare <15 x double> @llvm.vp.round.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_round_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v15f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.round.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_round_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v16f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.round.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 4
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %e
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 4
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 4
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index b510532408cb8c..8391c7939180a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -72,10 +72,10 @@ define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -154,10 +154,10 @@ define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -238,10 +238,10 @@ define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.roundeven.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_v16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -283,10 +283,10 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -324,10 +324,10 @@ define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %e
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.roundeven.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_roundeven_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.roundeven.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_roundeven_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -649,9 +649,9 @@ declare <15 x double> @llvm.vp.roundeven.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_roundeven_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v15f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.roundeven.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_roundeven_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v16f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.roundeven.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 0
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroex
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 0
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index d0a41a2bb968ca..8c38d244602655 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -32,10 +32,10 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -72,10 +72,10 @@ define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %ev
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -114,10 +114,10 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -154,10 +154,10 @@ define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %ev
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -197,10 +197,10 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -238,10 +238,10 @@ define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %ev
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -261,9 +261,9 @@ declare <16 x half> @llvm.vp.roundtozero.v16f16(<16 x half>, <16 x i1>, i32)
 define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_v16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI6_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI6_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -283,10 +283,10 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -324,10 +324,10 @@ define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -561,9 +561,9 @@ declare <4 x double> @llvm.vp.roundtozero.v4f64(<4 x double>, <4 x i1>, i32)
 define <4 x double> @vp_roundtozero_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI18_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI18_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -605,9 +605,9 @@ declare <8 x double> @llvm.vp.roundtozero.v8f64(<8 x double>, <8 x i1>, i32)
 define <8 x double> @vp_roundtozero_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI20_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI20_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -649,9 +649,9 @@ declare <15 x double> @llvm.vp.roundtozero.v15f64(<15 x double>, <15 x i1>, i32)
 define <15 x double> @vp_roundtozero_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v15f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI22_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI22_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -693,9 +693,9 @@ declare <16 x double> @llvm.vp.roundtozero.v16f64(<16 x double>, <16 x i1>, i32)
 define <16 x double> @vp_roundtozero_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v16f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI24_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI24_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -737,59 +737,69 @@ declare <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv1r.v v25, v0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
+; CHECK-NEXT:    addi a1, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a1
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16, v0.t
+; CHECK-NEXT:    vfabs.v v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -808,27 +818,30 @@ define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zero
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8
+; CHECK-NEXT:    lui a2, %hi(.LCPI27_0)
+; CHECK-NEXT:    fld fa5, %lo(.LCPI27_0)(a2)
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    sltu a0, a0, a2
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a0, a0, a2
+; CHECK-NEXT:    fsrmi a2, 1
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a1, 1
+; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    fsrm a1
+; CHECK-NEXT:    fsrm a2
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    fsrmi a1, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    addi a1, a0, -16
-; CHECK-NEXT:    sltu a0, a0, a1
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
-; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    fsrm a0
+; CHECK-NEXT:    fsrm a1
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
index 80561be0ca2f5f..8da605d35270de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll
@@ -102,37 +102,37 @@ define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    vle8.v v11, (a1)
-; CHECK-NEXT:    vminu.vv v12, v8, v9
-; CHECK-NEXT:    vmaxu.vv v8, v8, v9
-; CHECK-NEXT:    vsub.vv v8, v8, v12
-; CHECK-NEXT:    vminu.vv v9, v10, v11
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    vle8.v v12, (a0)
 ; CHECK-NEXT:    vle8.v v13, (a1)
-; CHECK-NEXT:    vmaxu.vv v10, v10, v11
-; CHECK-NEXT:    vsub.vv v9, v10, v9
-; CHECK-NEXT:    vwaddu.vv v10, v9, v8
-; CHECK-NEXT:    vminu.vv v8, v12, v13
-; CHECK-NEXT:    vmaxu.vv v9, v12, v13
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    vle8.v v12, (a0)
-; CHECK-NEXT:    vle8.v v13, (a1)
-; CHECK-NEXT:    vsub.vv v8, v9, v8
+; CHECK-NEXT:    vminu.vv v14, v8, v9
+; CHECK-NEXT:    vmaxu.vv v8, v8, v9
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vsub.vv v8, v8, v14
+; CHECK-NEXT:    vminu.vv v14, v10, v11
+; CHECK-NEXT:    vmaxu.vv v10, v10, v11
+; CHECK-NEXT:    vle8.v v11, (a1)
+; CHECK-NEXT:    vsub.vv v10, v10, v14
+; CHECK-NEXT:    vminu.vv v14, v12, v13
+; CHECK-NEXT:    vmaxu.vv v12, v12, v13
+; CHECK-NEXT:    vwaddu.vv v16, v10, v8
+; CHECK-NEXT:    vsub.vv v8, v12, v14
+; CHECK-NEXT:    vminu.vv v10, v9, v11
+; CHECK-NEXT:    vmaxu.vv v9, v9, v11
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v14, v8
-; CHECK-NEXT:    vwaddu.vv v16, v14, v10
+; CHECK-NEXT:    vzext.vf2 v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vminu.vv v8, v12, v13
-; CHECK-NEXT:    vmaxu.vv v9, v12, v13
-; CHECK-NEXT:    vsub.vv v8, v9, v8
+; CHECK-NEXT:    vsub.vv v8, v9, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vwaddu.vv v20, v12, v16
 ; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    vwaddu.wv v16, v16, v10
+; CHECK-NEXT:    vwaddu.wv v20, v20, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vredsum.vs v8, v16, v8
+; CHECK-NEXT:    vredsum.vs v8, v20, v8
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
index 4621f339ca8828..6b81b781a898f9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-scalarized.ll
@@ -14,33 +14,31 @@ define <8 x float> @fpext_v8bf16(<8 x bfloat> %x) {
 ; CHECK-NEXT:    fmv.x.w a6, fa6
 ; CHECK-NEXT:    fmv.x.w a7, fa7
 ; CHECK-NEXT:    slli a7, a7, 16
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a7
 ; CHECK-NEXT:    slli a6, a6, 16
-; CHECK-NEXT:    vmv.s.x v9, a6
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    slli a5, a5, 16
-; CHECK-NEXT:    vmv.s.x v8, a5
 ; CHECK-NEXT:    slli a4, a4, 16
-; CHECK-NEXT:    vmv.s.x v10, a4
-; CHECK-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v9, 2
 ; CHECK-NEXT:    slli a3, a3, 16
-; CHECK-NEXT:    vmv.s.x v8, a3
 ; CHECK-NEXT:    slli a2, a2, 16
-; CHECK-NEXT:    vmv.s.x v9, a2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    slli a1, a1, 16
-; CHECK-NEXT:    vmv.s.x v11, a1
 ; CHECK-NEXT:    slli a0, a0, 16
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, a7
+; CHECK-NEXT:    vmv.s.x v9, a6
+; CHECK-NEXT:    vmv.s.x v10, a5
+; CHECK-NEXT:    vmv.s.x v12, a4
+; CHECK-NEXT:    vmv.s.x v11, a3
+; CHECK-NEXT:    vmv.s.x v13, a2
+; CHECK-NEXT:    vslideup.vi v9, v8, 1
+; CHECK-NEXT:    vmv.s.x v14, a1
+; CHECK-NEXT:    vslideup.vi v12, v10, 1
+; CHECK-NEXT:    vslideup.vi v13, v11, 1
 ; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vslideup.vi v8, v11, 1
+; CHECK-NEXT:    vslideup.vi v8, v14, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vslideup.vi v12, v9, 2
+; CHECK-NEXT:    vslideup.vi v8, v13, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    ret
   %y = fpext <8 x bfloat> %x to <8 x float>
   ret <8 x float> %y
@@ -58,33 +56,31 @@ define <8 x float> @fpext_v8f16(<8 x bfloat> %x) {
 ; CHECK-NEXT:    fmv.x.w a6, fa6
 ; CHECK-NEXT:    fmv.x.w a7, fa7
 ; CHECK-NEXT:    slli a7, a7, 16
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vmv.s.x v8, a7
 ; CHECK-NEXT:    slli a6, a6, 16
-; CHECK-NEXT:    vmv.s.x v9, a6
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    slli a5, a5, 16
-; CHECK-NEXT:    vmv.s.x v8, a5
 ; CHECK-NEXT:    slli a4, a4, 16
-; CHECK-NEXT:    vmv.s.x v10, a4
-; CHECK-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v9, 2
 ; CHECK-NEXT:    slli a3, a3, 16
-; CHECK-NEXT:    vmv.s.x v8, a3
 ; CHECK-NEXT:    slli a2, a2, 16
-; CHECK-NEXT:    vmv.s.x v9, a2
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
 ; CHECK-NEXT:    slli a1, a1, 16
-; CHECK-NEXT:    vmv.s.x v11, a1
 ; CHECK-NEXT:    slli a0, a0, 16
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, a7
+; CHECK-NEXT:    vmv.s.x v9, a6
+; CHECK-NEXT:    vmv.s.x v10, a5
+; CHECK-NEXT:    vmv.s.x v12, a4
+; CHECK-NEXT:    vmv.s.x v11, a3
+; CHECK-NEXT:    vmv.s.x v13, a2
+; CHECK-NEXT:    vslideup.vi v9, v8, 1
+; CHECK-NEXT:    vmv.s.x v14, a1
+; CHECK-NEXT:    vslideup.vi v12, v10, 1
+; CHECK-NEXT:    vslideup.vi v13, v11, 1
 ; CHECK-NEXT:    vmv.s.x v8, a0
-; CHECK-NEXT:    vslideup.vi v8, v11, 1
+; CHECK-NEXT:    vslideup.vi v8, v14, 1
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vslideup.vi v12, v9, 2
+; CHECK-NEXT:    vslideup.vi v8, v13, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
 ; CHECK-NEXT:    ret
   %y = fpext <8 x bfloat> %x to <8 x float>
   ret <8 x float> %y

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 5c8be062649f51..03d5762b4903ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -411,9 +411,9 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v10, v0.t
@@ -438,9 +438,9 @@ define <8 x i1> @fcmp_one_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v12, v0.t
@@ -492,15 +492,15 @@ define <8 x i1> @fcmp_ord_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -523,15 +523,15 @@ define <8 x i1> @fcmp_ord_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -575,9 +575,9 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v10, v0.t
@@ -602,9 +602,9 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v12, v0.t
@@ -1008,15 +1008,15 @@ define <8 x i1> @fcmp_uno_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1039,15 +1039,15 @@ define <8 x i1> @fcmp_uno_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -1066,38 +1066,38 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFH-NEXT:    slli a1, a1, 4
 ; ZVFH-NEXT:    sub sp, sp, a1
 ; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFH-NEXT:    addi a1, a0, 128
-; ZVFH-NEXT:    li a3, 64
-; ZVFH-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
-; ZVFH-NEXT:    vle16.v v24, (a1)
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    slli a1, a1, 3
 ; ZVFH-NEXT:    add a1, sp, a1
 ; ZVFH-NEXT:    addi a1, a1, 16
-; ZVFH-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vle16.v v24, (a0)
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; ZVFH-NEXT:    vslidedown.vi v6, v0, 8
+; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFH-NEXT:    addi a1, a0, 128
+; ZVFH-NEXT:    li a3, 64
+; ZVFH-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
+; ZVFH-NEXT:    vle16.v v16, (a1)
+; ZVFH-NEXT:    addi a1, sp, 16
+; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vle16.v v16, (a0)
 ; ZVFH-NEXT:    mv a0, a2
+; ZVFH-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
+; ZVFH-NEXT:    vslidedown.vi v24, v0, 8
 ; ZVFH-NEXT:    bltu a2, a3, .LBB43_2
 ; ZVFH-NEXT:  # %bb.1:
 ; ZVFH-NEXT:    li a0, 64
 ; ZVFH-NEXT:  .LBB43_2:
-; ZVFH-NEXT:    addi a1, sp, 16
-; ZVFH-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v7, v8, v24, v0.t
+; ZVFH-NEXT:    vmfeq.vv v7, v8, v16, v0.t
 ; ZVFH-NEXT:    addi a0, a2, -64
 ; ZVFH-NEXT:    sltu a1, a2, a0
 ; ZVFH-NEXT:    addi a1, a1, -1
 ; ZVFH-NEXT:    and a0, a1, a0
-; ZVFH-NEXT:    vmv1r.v v0, v6
+; ZVFH-NEXT:    vmv1r.v v0, v24
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    slli a1, a1, 3
 ; ZVFH-NEXT:    add a1, sp, a1
 ; ZVFH-NEXT:    addi a1, a1, 16
+; ZVFH-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFH-NEXT:    addi a1, sp, 16
 ; ZVFH-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v8, v16, v24, v0.t
@@ -1114,1757 +1114,2269 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ;
 ; ZVFHMIN32-LABEL: fcmp_oeq_vv_v128f16:
 ; ZVFHMIN32:       # %bb.0:
-; ZVFHMIN32-NEXT:    addi sp, sp, -768
-; ZVFHMIN32-NEXT:    .cfi_def_cfa_offset 768
-; ZVFHMIN32-NEXT:    sw ra, 764(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s0, 760(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s2, 756(sp) # 4-byte Folded Spill
-; ZVFHMIN32-NEXT:    sw s3, 752(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    addi sp, sp, -896
+; ZVFHMIN32-NEXT:    .cfi_def_cfa_offset 896
+; ZVFHMIN32-NEXT:    sw ra, 892(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s0, 888(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s2, 884(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s3, 880(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s4, 876(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s5, 872(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s6, 868(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s7, 864(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s8, 860(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s9, 856(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s10, 852(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    sw s11, 848(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    .cfi_offset ra, -4
 ; ZVFHMIN32-NEXT:    .cfi_offset s0, -8
 ; ZVFHMIN32-NEXT:    .cfi_offset s2, -12
 ; ZVFHMIN32-NEXT:    .cfi_offset s3, -16
-; ZVFHMIN32-NEXT:    addi s0, sp, 768
+; ZVFHMIN32-NEXT:    .cfi_offset s4, -20
+; ZVFHMIN32-NEXT:    .cfi_offset s5, -24
+; ZVFHMIN32-NEXT:    .cfi_offset s6, -28
+; ZVFHMIN32-NEXT:    .cfi_offset s7, -32
+; ZVFHMIN32-NEXT:    .cfi_offset s8, -36
+; ZVFHMIN32-NEXT:    .cfi_offset s9, -40
+; ZVFHMIN32-NEXT:    .cfi_offset s10, -44
+; ZVFHMIN32-NEXT:    .cfi_offset s11, -48
+; ZVFHMIN32-NEXT:    addi s0, sp, 896
 ; ZVFHMIN32-NEXT:    .cfi_def_cfa s0, 0
+; ZVFHMIN32-NEXT:    csrr a1, vlenb
+; ZVFHMIN32-NEXT:    li a2, 30
+; ZVFHMIN32-NEXT:    mul a1, a1, a2
+; ZVFHMIN32-NEXT:    sub sp, sp, a1
 ; ZVFHMIN32-NEXT:    andi sp, sp, -128
 ; ZVFHMIN32-NEXT:    addi a1, a0, 128
 ; ZVFHMIN32-NEXT:    li a2, 64
+; ZVFHMIN32-NEXT:    addi a3, sp, 640
+; ZVFHMIN32-NEXT:    addi a4, sp, 384
+; ZVFHMIN32-NEXT:    addi a5, sp, 512
 ; ZVFHMIN32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; ZVFHMIN32-NEXT:    vle16.v v24, (a1)
 ; ZVFHMIN32-NEXT:    vle16.v v0, (a0)
-; ZVFHMIN32-NEXT:    addi a0, sp, 512
-; ZVFHMIN32-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN32-NEXT:    addi a0, sp, 256
-; ZVFHMIN32-NEXT:    vse16.v v0, (a0)
-; ZVFHMIN32-NEXT:    addi a0, sp, 384
-; ZVFHMIN32-NEXT:    vse16.v v16, (a0)
-; ZVFHMIN32-NEXT:    addi a0, sp, 128
+; ZVFHMIN32-NEXT:    vle16.v v24, (a1)
+; ZVFHMIN32-NEXT:    vse16.v v8, (a3)
+; ZVFHMIN32-NEXT:    vse16.v v0, (a4)
+; ZVFHMIN32-NEXT:    vse16.v v16, (a5)
 ; ZVFHMIN32-NEXT:    vse16.v v24, (a0)
+; ZVFHMIN32-NEXT:    lh a0, 704(sp)
+; ZVFHMIN32-NEXT:    lh a1, 448(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 160(sp)
+; ZVFHMIN32-NEXT:    lh a0, 702(sp)
+; ZVFHMIN32-NEXT:    lh a1, 446(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 159(sp)
+; ZVFHMIN32-NEXT:    lh a0, 700(sp)
+; ZVFHMIN32-NEXT:    lh a1, 444(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 158(sp)
+; ZVFHMIN32-NEXT:    lh a0, 698(sp)
+; ZVFHMIN32-NEXT:    lh a1, 442(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 157(sp)
+; ZVFHMIN32-NEXT:    lh a0, 696(sp)
+; ZVFHMIN32-NEXT:    lh a1, 440(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 156(sp)
+; ZVFHMIN32-NEXT:    lh a0, 694(sp)
+; ZVFHMIN32-NEXT:    lh a1, 438(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 155(sp)
+; ZVFHMIN32-NEXT:    lh a0, 692(sp)
+; ZVFHMIN32-NEXT:    lh a1, 436(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 154(sp)
+; ZVFHMIN32-NEXT:    lh a0, 690(sp)
+; ZVFHMIN32-NEXT:    lh a1, 434(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 153(sp)
+; ZVFHMIN32-NEXT:    lh a0, 688(sp)
+; ZVFHMIN32-NEXT:    lh a1, 432(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 152(sp)
+; ZVFHMIN32-NEXT:    lh a0, 686(sp)
+; ZVFHMIN32-NEXT:    lh a1, 430(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 151(sp)
+; ZVFHMIN32-NEXT:    lh a0, 684(sp)
+; ZVFHMIN32-NEXT:    lh a1, 428(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 150(sp)
+; ZVFHMIN32-NEXT:    lh a0, 682(sp)
+; ZVFHMIN32-NEXT:    lh a1, 426(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 149(sp)
+; ZVFHMIN32-NEXT:    lh a0, 680(sp)
+; ZVFHMIN32-NEXT:    lh a1, 424(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 148(sp)
+; ZVFHMIN32-NEXT:    lh a0, 678(sp)
+; ZVFHMIN32-NEXT:    lh a1, 422(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 147(sp)
+; ZVFHMIN32-NEXT:    lh a0, 676(sp)
+; ZVFHMIN32-NEXT:    lh a1, 420(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 146(sp)
+; ZVFHMIN32-NEXT:    lh a0, 674(sp)
+; ZVFHMIN32-NEXT:    lh a1, 418(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT:    sb a0, 145(sp)
+; ZVFHMIN32-NEXT:    lh a0, 672(sp)
+; ZVFHMIN32-NEXT:    lh a1, 416(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a2, 128(sp)
+; ZVFHMIN32-NEXT:    sb a0, 144(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 576(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 320(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 32(sp)
+; ZVFHMIN32-NEXT:    sb a0, 224(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 574(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 318(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 31(sp)
+; ZVFHMIN32-NEXT:    sb a0, 223(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 572(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 316(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 30(sp)
+; ZVFHMIN32-NEXT:    sb a0, 222(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 570(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 314(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 29(sp)
+; ZVFHMIN32-NEXT:    sb a0, 221(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 568(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 312(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 28(sp)
+; ZVFHMIN32-NEXT:    sb a0, 220(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 566(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 310(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 27(sp)
+; ZVFHMIN32-NEXT:    sb a0, 219(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 564(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 308(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 26(sp)
+; ZVFHMIN32-NEXT:    sb a0, 218(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 562(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 306(sp)
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 29
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 6
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 28
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 5
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 27
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 4
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 26
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 3
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 25
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 24
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 1
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 23
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v26, v8, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 1
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v4, v8, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 8
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v16
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 25(sp)
+; ZVFHMIN32-NEXT:    sb a0, 217(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 304(sp)
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v5, v16, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 4
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 21
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 20
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 1
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 22
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 14
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 24(sp)
+; ZVFHMIN32-NEXT:    sb a0, 216(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 302(sp)
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v13, v0, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 4
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 3
+; ZVFHMIN32-NEXT:    vslidedown.vi v21, v0, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 1
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 15
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 14
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 13
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 6
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 12
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 12
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 11
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 10
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 10
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 4
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v0, v0, 8
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
+; ZVFHMIN32-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s t4, v26
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 23(sp)
+; ZVFHMIN32-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 300(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v20
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v28
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 22(sp)
+; ZVFHMIN32-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 298(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 1
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v0
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 21(sp)
+; ZVFHMIN32-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 296(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v2
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v30
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 20(sp)
+; ZVFHMIN32-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 294(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v22
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 19(sp)
-; ZVFHMIN32-NEXT:    lh a0, 548(sp)
-; ZVFHMIN32-NEXT:    lh a1, 292(sp)
+; ZVFHMIN32-NEXT:    sb a0, 211(sp)
+; ZVFHMIN32-NEXT:    lh a1, 548(sp)
+; ZVFHMIN32-NEXT:    lh t5, 292(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v14
+; ZVFHMIN32-NEXT:    sw a0, 116(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN32-NEXT:    sw a0, 124(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 210(sp)
+; ZVFHMIN32-NEXT:    lh a1, 546(sp)
+; ZVFHMIN32-NEXT:    lh t5, 290(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v24
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN32-NEXT:    sb a1, 209(sp)
+; ZVFHMIN32-NEXT:    lh a1, 544(sp)
+; ZVFHMIN32-NEXT:    lh t5, 288(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a4, 192(sp)
+; ZVFHMIN32-NEXT:    sb a1, 208(sp)
+; ZVFHMIN32-NEXT:    lh t5, 738(sp)
+; ZVFHMIN32-NEXT:    lh t6, 482(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v12
+; ZVFHMIN32-NEXT:    sw a0, 108(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v10
+; ZVFHMIN32-NEXT:    sw a0, 120(sp) # 4-byte Folded Spill
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t5, 177(sp)
+; ZVFHMIN32-NEXT:    lh t5, 736(sp)
+; ZVFHMIN32-NEXT:    lh t6, 480(sp)
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 29
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s5, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 28
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s6, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t5, 176(sp)
+; ZVFHMIN32-NEXT:    lh t5, 734(sp)
+; ZVFHMIN32-NEXT:    lh t6, 478(sp)
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 27
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s7, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 26
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s8, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t5, 175(sp)
+; ZVFHMIN32-NEXT:    lh t5, 732(sp)
+; ZVFHMIN32-NEXT:    lh t6, 476(sp)
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 25
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s4, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 24
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s3, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t5, 174(sp)
+; ZVFHMIN32-NEXT:    lh t6, 730(sp)
+; ZVFHMIN32-NEXT:    lh s9, 474(sp)
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    li a1, 23
+; ZVFHMIN32-NEXT:    mul a0, a0, a1
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh s2, 848(a0) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN32-NEXT:    feq.h t6, fa5, fa4
+; ZVFHMIN32-NEXT:    sb t6, 173(sp)
+; ZVFHMIN32-NEXT:    lh s9, 728(sp)
+; ZVFHMIN32-NEXT:    lh s10, 472(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v31
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v13
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
+; ZVFHMIN32-NEXT:    sb s9, 172(sp)
+; ZVFHMIN32-NEXT:    lh s9, 726(sp)
+; ZVFHMIN32-NEXT:    lh s10, 470(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v29
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v11
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
+; ZVFHMIN32-NEXT:    sb s9, 171(sp)
+; ZVFHMIN32-NEXT:    lh s10, 724(sp)
+; ZVFHMIN32-NEXT:    lh s11, 468(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v7
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v9
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s10
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s11
+; ZVFHMIN32-NEXT:    feq.h s10, fa5, fa4
+; ZVFHMIN32-NEXT:    sb s10, 170(sp)
+; ZVFHMIN32-NEXT:    lh a0, 722(sp)
+; ZVFHMIN32-NEXT:    lh a1, 466(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v21
+; ZVFHMIN32-NEXT:    vmv.x.s s11, v27
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 18(sp)
-; ZVFHMIN32-NEXT:    lh a0, 546(sp)
-; ZVFHMIN32-NEXT:    lh a1, 290(sp)
+; ZVFHMIN32-NEXT:    sb a0, 169(sp)
+; ZVFHMIN32-NEXT:    lh a0, 720(sp)
+; ZVFHMIN32-NEXT:    lh a1, 464(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 168(sp)
+; ZVFHMIN32-NEXT:    lh a0, 718(sp)
+; ZVFHMIN32-NEXT:    lh a1, 462(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, s7
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s8
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa1, fa0
+; ZVFHMIN32-NEXT:    fmv.h.x fa1, ra
+; ZVFHMIN32-NEXT:    sb a0, 167(sp)
+; ZVFHMIN32-NEXT:    lh a0, 716(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN32-NEXT:    lh a1, 460(sp)
+; ZVFHMIN32-NEXT:    feq.h s5, fa5, fa1
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN32-NEXT:    sb a1, 166(sp)
+; ZVFHMIN32-NEXT:    lh a1, 714(sp)
+; ZVFHMIN32-NEXT:    lh a2, 458(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h a3, fa3, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN32-NEXT:    sb a1, 165(sp)
+; ZVFHMIN32-NEXT:    lh a1, 712(sp)
+; ZVFHMIN32-NEXT:    lh a2, 456(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN32-NEXT:    feq.h a4, fa2, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa3, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, s2
+; ZVFHMIN32-NEXT:    sb a1, 164(sp)
+; ZVFHMIN32-NEXT:    lh a1, 710(sp)
+; ZVFHMIN32-NEXT:    lh a2, 454(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s9
+; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s10
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, s11
+; ZVFHMIN32-NEXT:    sb a1, 163(sp)
+; ZVFHMIN32-NEXT:    lh a1, 708(sp)
+; ZVFHMIN32-NEXT:    lh a2, 452(sp)
+; ZVFHMIN32-NEXT:    feq.h s3, fa4, fa5
+; ZVFHMIN32-NEXT:    feq.h s4, fa3, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a1, 162(sp)
+; ZVFHMIN32-NEXT:    lh a1, 706(sp)
+; ZVFHMIN32-NEXT:    lh a2, 450(sp)
+; ZVFHMIN32-NEXT:    sb s4, 129(sp)
+; ZVFHMIN32-NEXT:    sb s3, 130(sp)
+; ZVFHMIN32-NEXT:    sb s2, 131(sp)
+; ZVFHMIN32-NEXT:    sb a4, 132(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a3, 133(sp)
+; ZVFHMIN32-NEXT:    sb a0, 134(sp)
+; ZVFHMIN32-NEXT:    sb s5, 135(sp)
+; ZVFHMIN32-NEXT:    sb a1, 161(sp)
+; ZVFHMIN32-NEXT:    lh a0, 610(sp)
+; ZVFHMIN32-NEXT:    lh a1, 354(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v5
+; ZVFHMIN32-NEXT:    vmv.x.s s5, v23
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 17(sp)
-; ZVFHMIN32-NEXT:    lh a0, 544(sp)
-; ZVFHMIN32-NEXT:    lh a1, 288(sp)
+; ZVFHMIN32-NEXT:    sb a0, 241(sp)
+; ZVFHMIN32-NEXT:    lh a0, 608(sp)
+; ZVFHMIN32-NEXT:    lh a1, 352(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 21
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 20
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v0
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    sb a0, 240(sp)
+; ZVFHMIN32-NEXT:    lh a0, 606(sp)
+; ZVFHMIN32-NEXT:    lh a1, 350(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 22
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN32-NEXT:    sb a0, 239(sp)
+; ZVFHMIN32-NEXT:    lh a0, 604(sp)
+; ZVFHMIN32-NEXT:    lh a1, 348(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 238(sp)
+; ZVFHMIN32-NEXT:    lh a0, 602(sp)
+; ZVFHMIN32-NEXT:    lh a1, 346(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 237(sp)
+; ZVFHMIN32-NEXT:    lh a0, 600(sp)
+; ZVFHMIN32-NEXT:    lh a1, 344(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 236(sp)
+; ZVFHMIN32-NEXT:    lh a0, 598(sp)
+; ZVFHMIN32-NEXT:    lh a1, 342(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 235(sp)
+; ZVFHMIN32-NEXT:    lh a0, 596(sp)
+; ZVFHMIN32-NEXT:    lh a1, 340(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 234(sp)
+; ZVFHMIN32-NEXT:    lh a0, 594(sp)
+; ZVFHMIN32-NEXT:    lh a1, 338(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    sb a0, 233(sp)
+; ZVFHMIN32-NEXT:    lh a0, 592(sp)
 ; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    lh t5, 336(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN32-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, t5
+; ZVFHMIN32-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN32-NEXT:    sb a0, 232(sp)
+; ZVFHMIN32-NEXT:    lh a0, 590(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa2, a3
+; ZVFHMIN32-NEXT:    lh a2, 334(sp)
+; ZVFHMIN32-NEXT:    feq.h t5, fa5, fa3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    feq.h t6, fa4, fa2
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s6
+; ZVFHMIN32-NEXT:    sb a0, 231(sp)
+; ZVFHMIN32-NEXT:    lh a0, 588(sp)
+; ZVFHMIN32-NEXT:    lh a2, 332(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN32-NEXT:    sb a0, 230(sp)
+; ZVFHMIN32-NEXT:    lh a0, 586(sp)
+; ZVFHMIN32-NEXT:    lh a2, 330(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s8
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN32-NEXT:    sb a0, 229(sp)
+; ZVFHMIN32-NEXT:    lh a0, 584(sp)
+; ZVFHMIN32-NEXT:    lh a2, 328(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN32-NEXT:    feq.h s4, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
+; ZVFHMIN32-NEXT:    sb a0, 228(sp)
+; ZVFHMIN32-NEXT:    lh a0, 582(sp)
+; ZVFHMIN32-NEXT:    lh a2, 326(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a1, 0(sp)
-; ZVFHMIN32-NEXT:    sb a0, 16(sp)
-; ZVFHMIN32-NEXT:    lh a0, 448(sp)
-; ZVFHMIN32-NEXT:    lh a1, 192(sp)
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN32-NEXT:    sb a0, 227(sp)
+; ZVFHMIN32-NEXT:    lh a0, 580(sp)
+; ZVFHMIN32-NEXT:    lh a2, 324(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s7
+; ZVFHMIN32-NEXT:    feq.h s2, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 226(sp)
+; ZVFHMIN32-NEXT:    lh a0, 578(sp)
+; ZVFHMIN32-NEXT:    lh a2, 322(sp)
+; ZVFHMIN32-NEXT:    sb s2, 193(sp)
+; ZVFHMIN32-NEXT:    sb a1, 194(sp)
+; ZVFHMIN32-NEXT:    sb s4, 195(sp)
+; ZVFHMIN32-NEXT:    sb a4, 196(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a3, 197(sp)
+; ZVFHMIN32-NEXT:    sb t6, 198(sp)
+; ZVFHMIN32-NEXT:    sb t5, 199(sp)
+; ZVFHMIN32-NEXT:    sb a0, 225(sp)
+; ZVFHMIN32-NEXT:    lh a0, 766(sp)
+; ZVFHMIN32-NEXT:    lh a1, 510(sp)
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 14
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 191(sp)
+; ZVFHMIN32-NEXT:    lh a0, 764(sp)
+; ZVFHMIN32-NEXT:    lh a1, 508(sp)
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v6
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 2
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 96(sp)
-; ZVFHMIN32-NEXT:    lh a0, 446(sp)
-; ZVFHMIN32-NEXT:    lh a1, 190(sp)
+; ZVFHMIN32-NEXT:    sb a0, 190(sp)
+; ZVFHMIN32-NEXT:    lh a0, 762(sp)
+; ZVFHMIN32-NEXT:    lh a1, 506(sp)
+; ZVFHMIN32-NEXT:    csrr a3, vlenb
+; ZVFHMIN32-NEXT:    slli a3, a3, 3
+; ZVFHMIN32-NEXT:    add a3, sp, a3
+; ZVFHMIN32-NEXT:    addi a3, a3, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN32-NEXT:    csrr a4, vlenb
+; ZVFHMIN32-NEXT:    li s3, 6
+; ZVFHMIN32-NEXT:    mul a4, a4, s3
+; ZVFHMIN32-NEXT:    add a4, sp, a4
+; ZVFHMIN32-NEXT:    addi a4, a4, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 95(sp)
-; ZVFHMIN32-NEXT:    lh a0, 444(sp)
-; ZVFHMIN32-NEXT:    lh a1, 188(sp)
+; ZVFHMIN32-NEXT:    sb a0, 189(sp)
+; ZVFHMIN32-NEXT:    lh a0, 760(sp)
+; ZVFHMIN32-NEXT:    lh a1, 504(sp)
+; ZVFHMIN32-NEXT:    csrr s3, vlenb
+; ZVFHMIN32-NEXT:    li s4, 12
+; ZVFHMIN32-NEXT:    mul s3, s3, s4
+; ZVFHMIN32-NEXT:    add s3, sp, s3
+; ZVFHMIN32-NEXT:    addi s3, s3, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s6, v8
+; ZVFHMIN32-NEXT:    csrr s3, vlenb
+; ZVFHMIN32-NEXT:    li s4, 10
+; ZVFHMIN32-NEXT:    mul s3, s3, s4
+; ZVFHMIN32-NEXT:    add s3, sp, s3
+; ZVFHMIN32-NEXT:    addi s3, s3, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s4, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 94(sp)
-; ZVFHMIN32-NEXT:    lh a0, 442(sp)
-; ZVFHMIN32-NEXT:    lh a1, 186(sp)
+; ZVFHMIN32-NEXT:    sb a0, 188(sp)
+; ZVFHMIN32-NEXT:    lh a0, 758(sp)
+; ZVFHMIN32-NEXT:    lh a1, 502(sp)
+; ZVFHMIN32-NEXT:    csrr s3, vlenb
+; ZVFHMIN32-NEXT:    slli s3, s3, 4
+; ZVFHMIN32-NEXT:    add s3, sp, s3
+; ZVFHMIN32-NEXT:    addi s3, s3, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s5, v8
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v16
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 93(sp)
-; ZVFHMIN32-NEXT:    lh a0, 440(sp)
-; ZVFHMIN32-NEXT:    lh a1, 184(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT:    sb a0, 187(sp)
+; ZVFHMIN32-NEXT:    lh a0, 756(sp)
+; ZVFHMIN32-NEXT:    lh a1, 500(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN32-NEXT:    feq.h t4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 92(sp)
-; ZVFHMIN32-NEXT:    lh a0, 438(sp)
-; ZVFHMIN32-NEXT:    lh a1, 182(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN32-NEXT:    sb a0, 186(sp)
+; ZVFHMIN32-NEXT:    lh a0, 754(sp)
+; ZVFHMIN32-NEXT:    lh a1, 498(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    feq.h t3, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 91(sp)
-; ZVFHMIN32-NEXT:    lh a0, 436(sp)
-; ZVFHMIN32-NEXT:    lh a1, 180(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
+; ZVFHMIN32-NEXT:    sb a0, 185(sp)
+; ZVFHMIN32-NEXT:    lh a0, 752(sp)
+; ZVFHMIN32-NEXT:    lh a1, 496(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 90(sp)
-; ZVFHMIN32-NEXT:    lh a0, 434(sp)
-; ZVFHMIN32-NEXT:    lh a1, 178(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT:    sb a0, 184(sp)
+; ZVFHMIN32-NEXT:    lh a0, 750(sp)
+; ZVFHMIN32-NEXT:    lh a1, 494(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 89(sp)
-; ZVFHMIN32-NEXT:    lh a0, 432(sp)
-; ZVFHMIN32-NEXT:    lh a1, 176(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
+; ZVFHMIN32-NEXT:    sb a0, 183(sp)
+; ZVFHMIN32-NEXT:    lh a0, 748(sp)
+; ZVFHMIN32-NEXT:    lh a1, 492(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s4
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 88(sp)
-; ZVFHMIN32-NEXT:    lh a0, 430(sp)
-; ZVFHMIN32-NEXT:    lh a1, 174(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
+; ZVFHMIN32-NEXT:    sb a0, 182(sp)
+; ZVFHMIN32-NEXT:    lh a0, 746(sp)
+; ZVFHMIN32-NEXT:    lh a1, 490(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s5
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 87(sp)
-; ZVFHMIN32-NEXT:    lh a0, 428(sp)
-; ZVFHMIN32-NEXT:    lh a1, 172(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
+; ZVFHMIN32-NEXT:    sb a0, 181(sp)
+; ZVFHMIN32-NEXT:    lh a0, 744(sp)
+; ZVFHMIN32-NEXT:    lh a1, 488(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 86(sp)
-; ZVFHMIN32-NEXT:    lh a0, 426(sp)
-; ZVFHMIN32-NEXT:    lh a1, 170(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
+; ZVFHMIN32-NEXT:    addi a1, sp, 848
+; ZVFHMIN32-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN32-NEXT:    sb a0, 180(sp)
+; ZVFHMIN32-NEXT:    lh a0, 742(sp)
+; ZVFHMIN32-NEXT:    lh a7, 486(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 179(sp)
+; ZVFHMIN32-NEXT:    lh a0, 740(sp)
+; ZVFHMIN32-NEXT:    lh a7, 484(sp)
+; ZVFHMIN32-NEXT:    sb a2, 140(sp)
+; ZVFHMIN32-NEXT:    sb t1, 141(sp)
+; ZVFHMIN32-NEXT:    sb t3, 142(sp)
+; ZVFHMIN32-NEXT:    sb t4, 143(sp)
+; ZVFHMIN32-NEXT:    sb a1, 136(sp)
+; ZVFHMIN32-NEXT:    sb a6, 137(sp)
+; ZVFHMIN32-NEXT:    sb a4, 138(sp)
+; ZVFHMIN32-NEXT:    sb a3, 139(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 178(sp)
+; ZVFHMIN32-NEXT:    lh a0, 638(sp)
+; ZVFHMIN32-NEXT:    lh a1, 382(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 14
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 85(sp)
-; ZVFHMIN32-NEXT:    lh a0, 424(sp)
-; ZVFHMIN32-NEXT:    lh a1, 168(sp)
+; ZVFHMIN32-NEXT:    sb a0, 255(sp)
+; ZVFHMIN32-NEXT:    lh a0, 636(sp)
+; ZVFHMIN32-NEXT:    lh a1, 380(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 13
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 84(sp)
-; ZVFHMIN32-NEXT:    lh a0, 422(sp)
-; ZVFHMIN32-NEXT:    lh a1, 166(sp)
+; ZVFHMIN32-NEXT:    sb a0, 254(sp)
+; ZVFHMIN32-NEXT:    lh a0, 634(sp)
+; ZVFHMIN32-NEXT:    lh a1, 378(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 12
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 83(sp)
-; ZVFHMIN32-NEXT:    lh a0, 420(sp)
-; ZVFHMIN32-NEXT:    lh a1, 164(sp)
+; ZVFHMIN32-NEXT:    sb a0, 253(sp)
+; ZVFHMIN32-NEXT:    lh a0, 632(sp)
+; ZVFHMIN32-NEXT:    lh a1, 376(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 11
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 82(sp)
-; ZVFHMIN32-NEXT:    lh a0, 418(sp)
-; ZVFHMIN32-NEXT:    lh a1, 162(sp)
+; ZVFHMIN32-NEXT:    sb a0, 252(sp)
+; ZVFHMIN32-NEXT:    lh a0, 630(sp)
+; ZVFHMIN32-NEXT:    lh a1, 374(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 10
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 81(sp)
-; ZVFHMIN32-NEXT:    lh a0, 416(sp)
-; ZVFHMIN32-NEXT:    lh a1, 160(sp)
+; ZVFHMIN32-NEXT:    sb a0, 251(sp)
+; ZVFHMIN32-NEXT:    lh a0, 628(sp)
+; ZVFHMIN32-NEXT:    lh a1, 372(sp)
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v24
+; ZVFHMIN32-NEXT:    lw a1, 112(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v16
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a1, 64(sp)
-; ZVFHMIN32-NEXT:    sb a0, 80(sp)
-; ZVFHMIN32-NEXT:    lh a0, 610(sp)
-; ZVFHMIN32-NEXT:    lh a1, 354(sp)
+; ZVFHMIN32-NEXT:    sb a0, 250(sp)
+; ZVFHMIN32-NEXT:    lh a0, 626(sp)
+; ZVFHMIN32-NEXT:    lh a1, 370(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 49(sp)
-; ZVFHMIN32-NEXT:    lh a0, 608(sp)
-; ZVFHMIN32-NEXT:    lh a1, 352(sp)
+; ZVFHMIN32-NEXT:    lw a1, 116(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    sb a0, 249(sp)
+; ZVFHMIN32-NEXT:    lh a0, 624(sp)
+; ZVFHMIN32-NEXT:    lh a1, 368(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 48(sp)
-; ZVFHMIN32-NEXT:    lh a0, 606(sp)
-; ZVFHMIN32-NEXT:    lh a1, 350(sp)
+; ZVFHMIN32-NEXT:    lw a1, 124(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    sb a0, 248(sp)
+; ZVFHMIN32-NEXT:    lh a0, 622(sp)
+; ZVFHMIN32-NEXT:    lh a1, 366(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 47(sp)
-; ZVFHMIN32-NEXT:    lh a1, 604(sp)
-; ZVFHMIN32-NEXT:    lh a2, 348(sp)
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 7
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v10
+; ZVFHMIN32-NEXT:    lw a1, 108(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 46(sp)
-; ZVFHMIN32-NEXT:    lh a2, 602(sp)
-; ZVFHMIN32-NEXT:    lh a3, 346(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 45(sp)
-; ZVFHMIN32-NEXT:    lh a3, 600(sp)
-; ZVFHMIN32-NEXT:    lh a4, 344(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 6
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a3, 44(sp)
-; ZVFHMIN32-NEXT:    lh a4, 598(sp)
-; ZVFHMIN32-NEXT:    lh a5, 342(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 6
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 43(sp)
-; ZVFHMIN32-NEXT:    lh a5, 596(sp)
-; ZVFHMIN32-NEXT:    lh a6, 340(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 5
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a5, 42(sp)
-; ZVFHMIN32-NEXT:    lh a6, 594(sp)
-; ZVFHMIN32-NEXT:    lh a7, 338(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 5
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a6, 41(sp)
-; ZVFHMIN32-NEXT:    lh a7, 592(sp)
-; ZVFHMIN32-NEXT:    lh t0, 336(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 4
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a7, 40(sp)
-; ZVFHMIN32-NEXT:    lh t0, 590(sp)
-; ZVFHMIN32-NEXT:    lh t1, 334(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 4
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t0, 39(sp)
-; ZVFHMIN32-NEXT:    lh t1, 588(sp)
-; ZVFHMIN32-NEXT:    lh t2, 332(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 3
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t1, 38(sp)
-; ZVFHMIN32-NEXT:    lh t2, 586(sp)
-; ZVFHMIN32-NEXT:    lh t3, 330(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 3
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 37(sp)
-; ZVFHMIN32-NEXT:    lh t2, 584(sp)
-; ZVFHMIN32-NEXT:    lh t3, 328(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 2
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 36(sp)
-; ZVFHMIN32-NEXT:    lh t2, 582(sp)
-; ZVFHMIN32-NEXT:    lh t3, 326(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 2
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 35(sp)
-; ZVFHMIN32-NEXT:    lh t2, 580(sp)
-; ZVFHMIN32-NEXT:    lh t3, 324(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 1
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 34(sp)
-; ZVFHMIN32-NEXT:    lh t2, 578(sp)
-; ZVFHMIN32-NEXT:    lh t3, 322(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 1
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a2, 5(sp)
-; ZVFHMIN32-NEXT:    sb a1, 6(sp)
-; ZVFHMIN32-NEXT:    sb a0, 7(sp)
-; ZVFHMIN32-NEXT:    sb t2, 33(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 1(sp)
-; ZVFHMIN32-NEXT:    sb a2, 2(sp)
-; ZVFHMIN32-NEXT:    sb a1, 3(sp)
-; ZVFHMIN32-NEXT:    sb a0, 4(sp)
-; ZVFHMIN32-NEXT:    lh a0, 482(sp)
-; ZVFHMIN32-NEXT:    lh a1, 226(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 113(sp)
-; ZVFHMIN32-NEXT:    lh a0, 480(sp)
-; ZVFHMIN32-NEXT:    lh a1, 224(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 112(sp)
-; ZVFHMIN32-NEXT:    lh a0, 478(sp)
-; ZVFHMIN32-NEXT:    lh a1, 222(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 111(sp)
-; ZVFHMIN32-NEXT:    lh a1, 476(sp)
-; ZVFHMIN32-NEXT:    lh a2, 220(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 7
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a1, 110(sp)
-; ZVFHMIN32-NEXT:    lh a2, 474(sp)
-; ZVFHMIN32-NEXT:    lh a3, 218(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 7
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 109(sp)
-; ZVFHMIN32-NEXT:    lh a3, 472(sp)
-; ZVFHMIN32-NEXT:    lh a4, 216(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 6
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a3, 108(sp)
-; ZVFHMIN32-NEXT:    lh a4, 470(sp)
-; ZVFHMIN32-NEXT:    lh a5, 214(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 6
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 107(sp)
-; ZVFHMIN32-NEXT:    lh a5, 468(sp)
-; ZVFHMIN32-NEXT:    lh a6, 212(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 5
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a5, 106(sp)
-; ZVFHMIN32-NEXT:    lh a6, 466(sp)
-; ZVFHMIN32-NEXT:    lh a7, 210(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 5
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a6, 105(sp)
-; ZVFHMIN32-NEXT:    lh a7, 464(sp)
-; ZVFHMIN32-NEXT:    lh t0, 208(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 4
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a7, 104(sp)
-; ZVFHMIN32-NEXT:    lh t0, 462(sp)
-; ZVFHMIN32-NEXT:    lh t1, 206(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 4
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t0, 103(sp)
-; ZVFHMIN32-NEXT:    lh t1, 460(sp)
-; ZVFHMIN32-NEXT:    lh t2, 204(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 3
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t1, 102(sp)
-; ZVFHMIN32-NEXT:    lh t2, 458(sp)
-; ZVFHMIN32-NEXT:    lh t3, 202(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 3
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 101(sp)
-; ZVFHMIN32-NEXT:    lh t2, 456(sp)
-; ZVFHMIN32-NEXT:    lh t3, 200(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 2
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 100(sp)
-; ZVFHMIN32-NEXT:    lh t2, 454(sp)
-; ZVFHMIN32-NEXT:    lh t3, 198(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 2
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 99(sp)
-; ZVFHMIN32-NEXT:    lh t2, 452(sp)
-; ZVFHMIN32-NEXT:    lh t3, 196(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 1
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb t2, 98(sp)
-; ZVFHMIN32-NEXT:    lh t2, 450(sp)
-; ZVFHMIN32-NEXT:    lh t3, 194(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 1
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a2, 69(sp)
-; ZVFHMIN32-NEXT:    sb a1, 70(sp)
-; ZVFHMIN32-NEXT:    sb a0, 71(sp)
-; ZVFHMIN32-NEXT:    sb t2, 97(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 65(sp)
-; ZVFHMIN32-NEXT:    sb a2, 66(sp)
-; ZVFHMIN32-NEXT:    sb a1, 67(sp)
-; ZVFHMIN32-NEXT:    sb a0, 68(sp)
-; ZVFHMIN32-NEXT:    lh a0, 638(sp)
-; ZVFHMIN32-NEXT:    lh a1, 382(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 63(sp)
-; ZVFHMIN32-NEXT:    lh a0, 636(sp)
-; ZVFHMIN32-NEXT:    lh a1, 380(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 62(sp)
-; ZVFHMIN32-NEXT:    lh a0, 634(sp)
-; ZVFHMIN32-NEXT:    lh a1, 378(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 61(sp)
-; ZVFHMIN32-NEXT:    lh a0, 632(sp)
-; ZVFHMIN32-NEXT:    lh a1, 376(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 60(sp)
-; ZVFHMIN32-NEXT:    lh a0, 630(sp)
-; ZVFHMIN32-NEXT:    lh a1, 374(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 59(sp)
-; ZVFHMIN32-NEXT:    lh a0, 628(sp)
-; ZVFHMIN32-NEXT:    lh a1, 372(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 58(sp)
-; ZVFHMIN32-NEXT:    lh a0, 626(sp)
-; ZVFHMIN32-NEXT:    lh a1, 370(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 57(sp)
-; ZVFHMIN32-NEXT:    lh a0, 624(sp)
-; ZVFHMIN32-NEXT:    lh a1, 368(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 56(sp)
-; ZVFHMIN32-NEXT:    lh a0, 622(sp)
-; ZVFHMIN32-NEXT:    lh a1, 366(sp)
-; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v0, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v22, v0, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v26, v0, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v28, v0, 12
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v0, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v0, 10
-; ZVFHMIN32-NEXT:    vslidedown.vi v12, v0, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v14, v0, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v20
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 55(sp)
+; ZVFHMIN32-NEXT:    sb a0, 247(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 620(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 364(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 15
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v20
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v22
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
+; ZVFHMIN32-NEXT:    feq.h a5, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 54(sp)
+; ZVFHMIN32-NEXT:    lw a1, 120(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN32-NEXT:    sb a0, 246(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 618(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 362(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v20
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v26
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, t0
+; ZVFHMIN32-NEXT:    feq.h t0, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 53(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN32-NEXT:    sb a0, 245(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 616(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 360(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 13
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v20
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v28
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN32-NEXT:    feq.h a7, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 52(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT:    sb a0, 244(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 614(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 358(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 12
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v20
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a6
+; ZVFHMIN32-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 51(sp)
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 8
+; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN32-NEXT:    sb a0, 243(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 612(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 356(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v18
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v8, 11
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v18
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 50(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 12(sp)
-; ZVFHMIN32-NEXT:    sb a2, 13(sp)
-; ZVFHMIN32-NEXT:    sb a1, 14(sp)
-; ZVFHMIN32-NEXT:    sb a0, 15(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 10
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v12
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 9
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v14
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v8, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 8(sp)
-; ZVFHMIN32-NEXT:    sb a2, 9(sp)
-; ZVFHMIN32-NEXT:    sb a1, 10(sp)
-; ZVFHMIN32-NEXT:    sb a0, 11(sp)
-; ZVFHMIN32-NEXT:    lh a0, 510(sp)
-; ZVFHMIN32-NEXT:    lh a1, 254(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 127(sp)
-; ZVFHMIN32-NEXT:    lh a0, 508(sp)
-; ZVFHMIN32-NEXT:    lh a1, 252(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 126(sp)
-; ZVFHMIN32-NEXT:    lh a0, 506(sp)
-; ZVFHMIN32-NEXT:    lh a1, 250(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 125(sp)
-; ZVFHMIN32-NEXT:    lh a0, 504(sp)
-; ZVFHMIN32-NEXT:    lh a1, 248(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 124(sp)
-; ZVFHMIN32-NEXT:    lh a0, 502(sp)
-; ZVFHMIN32-NEXT:    lh a1, 246(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 123(sp)
-; ZVFHMIN32-NEXT:    lh a0, 500(sp)
-; ZVFHMIN32-NEXT:    lh a1, 244(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 122(sp)
-; ZVFHMIN32-NEXT:    lh a0, 498(sp)
-; ZVFHMIN32-NEXT:    lh a1, 242(sp)
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v24, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v12, v24, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v14, v24, 12
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v24, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v24, 10
-; ZVFHMIN32-NEXT:    vslidedown.vi v22, v24, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v24, v24, 8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a0, 121(sp)
-; ZVFHMIN32-NEXT:    lh a2, 496(sp)
-; ZVFHMIN32-NEXT:    lh a3, 240(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 15
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN32-NEXT:    sb a5, 204(sp)
+; ZVFHMIN32-NEXT:    sb a4, 205(sp)
+; ZVFHMIN32-NEXT:    sb a2, 206(sp)
+; ZVFHMIN32-NEXT:    sb a3, 207(sp)
 ; ZVFHMIN32-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a2, 120(sp)
-; ZVFHMIN32-NEXT:    lh a4, 494(sp)
-; ZVFHMIN32-NEXT:    lh a5, 238(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 14
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 119(sp)
-; ZVFHMIN32-NEXT:    lh a4, 492(sp)
-; ZVFHMIN32-NEXT:    lh a5, 236(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v12
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 13
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 118(sp)
-; ZVFHMIN32-NEXT:    lh a4, 490(sp)
-; ZVFHMIN32-NEXT:    lh a5, 234(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v14
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 12
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 117(sp)
-; ZVFHMIN32-NEXT:    lh a4, 488(sp)
-; ZVFHMIN32-NEXT:    lh a5, 232(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v18
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 11
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 116(sp)
-; ZVFHMIN32-NEXT:    lh a4, 486(sp)
-; ZVFHMIN32-NEXT:    lh a5, 230(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v20
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 10
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 115(sp)
-; ZVFHMIN32-NEXT:    lh a4, 484(sp)
-; ZVFHMIN32-NEXT:    lh a5, 228(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v22
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 9
-; ZVFHMIN32-NEXT:    vmv.x.s s2, v8
-; ZVFHMIN32-NEXT:    vmv.x.s s3, v24
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN32-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN32-NEXT:    sb a4, 114(sp)
+; ZVFHMIN32-NEXT:    sb a2, 200(sp)
+; ZVFHMIN32-NEXT:    sb a6, 201(sp)
+; ZVFHMIN32-NEXT:    sb a7, 202(sp)
+; ZVFHMIN32-NEXT:    sb t0, 203(sp)
+; ZVFHMIN32-NEXT:    li a2, 128
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 76(sp)
-; ZVFHMIN32-NEXT:    sb a2, 77(sp)
-; ZVFHMIN32-NEXT:    sb a1, 78(sp)
-; ZVFHMIN32-NEXT:    sb a0, 79(sp)
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN32-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN32-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN32-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 8
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN32-NEXT:    sb a3, 72(sp)
-; ZVFHMIN32-NEXT:    sb a2, 73(sp)
-; ZVFHMIN32-NEXT:    sb a1, 74(sp)
-; ZVFHMIN32-NEXT:    sb a0, 75(sp)
-; ZVFHMIN32-NEXT:    li a0, 128
-; ZVFHMIN32-NEXT:    mv a1, sp
-; ZVFHMIN32-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; ZVFHMIN32-NEXT:    vle8.v v8, (a1)
+; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN32-NEXT:    sb a0, 242(sp)
+; ZVFHMIN32-NEXT:    addi a0, sp, 128
+; ZVFHMIN32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN32-NEXT:    vle8.v v8, (a0)
 ; ZVFHMIN32-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN32-NEXT:    vmsne.vi v0, v8, 0
-; ZVFHMIN32-NEXT:    addi sp, s0, -768
-; ZVFHMIN32-NEXT:    .cfi_def_cfa sp, 768
-; ZVFHMIN32-NEXT:    lw ra, 764(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s0, 760(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s2, 756(sp) # 4-byte Folded Reload
-; ZVFHMIN32-NEXT:    lw s3, 752(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    addi sp, s0, -896
+; ZVFHMIN32-NEXT:    .cfi_def_cfa sp, 896
+; ZVFHMIN32-NEXT:    lw ra, 892(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s0, 888(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s2, 884(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s3, 880(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s4, 876(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s5, 872(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s6, 868(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s7, 864(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s8, 860(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s9, 856(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s10, 852(sp) # 4-byte Folded Reload
+; ZVFHMIN32-NEXT:    lw s11, 848(sp) # 4-byte Folded Reload
 ; ZVFHMIN32-NEXT:    .cfi_restore ra
 ; ZVFHMIN32-NEXT:    .cfi_restore s0
 ; ZVFHMIN32-NEXT:    .cfi_restore s2
 ; ZVFHMIN32-NEXT:    .cfi_restore s3
-; ZVFHMIN32-NEXT:    addi sp, sp, 768
+; ZVFHMIN32-NEXT:    .cfi_restore s4
+; ZVFHMIN32-NEXT:    .cfi_restore s5
+; ZVFHMIN32-NEXT:    .cfi_restore s6
+; ZVFHMIN32-NEXT:    .cfi_restore s7
+; ZVFHMIN32-NEXT:    .cfi_restore s8
+; ZVFHMIN32-NEXT:    .cfi_restore s9
+; ZVFHMIN32-NEXT:    .cfi_restore s10
+; ZVFHMIN32-NEXT:    .cfi_restore s11
+; ZVFHMIN32-NEXT:    addi sp, sp, 896
 ; ZVFHMIN32-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN32-NEXT:    ret
 ;
 ; ZVFHMIN64-LABEL: fcmp_oeq_vv_v128f16:
 ; ZVFHMIN64:       # %bb.0:
-; ZVFHMIN64-NEXT:    addi sp, sp, -768
-; ZVFHMIN64-NEXT:    .cfi_def_cfa_offset 768
-; ZVFHMIN64-NEXT:    sd ra, 760(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s0, 752(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s2, 744(sp) # 8-byte Folded Spill
-; ZVFHMIN64-NEXT:    sd s3, 736(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    addi sp, sp, -896
+; ZVFHMIN64-NEXT:    .cfi_def_cfa_offset 896
+; ZVFHMIN64-NEXT:    sd ra, 888(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s0, 880(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s2, 872(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s3, 864(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s4, 856(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s5, 848(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s6, 840(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s7, 832(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s8, 824(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s9, 816(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s10, 808(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    sd s11, 800(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    .cfi_offset ra, -8
 ; ZVFHMIN64-NEXT:    .cfi_offset s0, -16
 ; ZVFHMIN64-NEXT:    .cfi_offset s2, -24
 ; ZVFHMIN64-NEXT:    .cfi_offset s3, -32
-; ZVFHMIN64-NEXT:    addi s0, sp, 768
+; ZVFHMIN64-NEXT:    .cfi_offset s4, -40
+; ZVFHMIN64-NEXT:    .cfi_offset s5, -48
+; ZVFHMIN64-NEXT:    .cfi_offset s6, -56
+; ZVFHMIN64-NEXT:    .cfi_offset s7, -64
+; ZVFHMIN64-NEXT:    .cfi_offset s8, -72
+; ZVFHMIN64-NEXT:    .cfi_offset s9, -80
+; ZVFHMIN64-NEXT:    .cfi_offset s10, -88
+; ZVFHMIN64-NEXT:    .cfi_offset s11, -96
+; ZVFHMIN64-NEXT:    addi s0, sp, 896
 ; ZVFHMIN64-NEXT:    .cfi_def_cfa s0, 0
+; ZVFHMIN64-NEXT:    csrr a1, vlenb
+; ZVFHMIN64-NEXT:    li a2, 30
+; ZVFHMIN64-NEXT:    mul a1, a1, a2
+; ZVFHMIN64-NEXT:    sub sp, sp, a1
 ; ZVFHMIN64-NEXT:    andi sp, sp, -128
 ; ZVFHMIN64-NEXT:    addi a1, a0, 128
 ; ZVFHMIN64-NEXT:    li a2, 64
+; ZVFHMIN64-NEXT:    addi a3, sp, 640
+; ZVFHMIN64-NEXT:    addi a4, sp, 384
+; ZVFHMIN64-NEXT:    addi a5, sp, 512
 ; ZVFHMIN64-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; ZVFHMIN64-NEXT:    vle16.v v24, (a1)
 ; ZVFHMIN64-NEXT:    vle16.v v0, (a0)
-; ZVFHMIN64-NEXT:    addi a0, sp, 512
-; ZVFHMIN64-NEXT:    vse16.v v8, (a0)
 ; ZVFHMIN64-NEXT:    addi a0, sp, 256
-; ZVFHMIN64-NEXT:    vse16.v v0, (a0)
-; ZVFHMIN64-NEXT:    addi a0, sp, 384
-; ZVFHMIN64-NEXT:    vse16.v v16, (a0)
-; ZVFHMIN64-NEXT:    addi a0, sp, 128
+; ZVFHMIN64-NEXT:    vle16.v v24, (a1)
+; ZVFHMIN64-NEXT:    vse16.v v8, (a3)
+; ZVFHMIN64-NEXT:    vse16.v v0, (a4)
+; ZVFHMIN64-NEXT:    vse16.v v16, (a5)
 ; ZVFHMIN64-NEXT:    vse16.v v24, (a0)
+; ZVFHMIN64-NEXT:    lh a0, 704(sp)
+; ZVFHMIN64-NEXT:    lh a1, 448(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 160(sp)
+; ZVFHMIN64-NEXT:    lh a0, 702(sp)
+; ZVFHMIN64-NEXT:    lh a1, 446(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 159(sp)
+; ZVFHMIN64-NEXT:    lh a0, 700(sp)
+; ZVFHMIN64-NEXT:    lh a1, 444(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 158(sp)
+; ZVFHMIN64-NEXT:    lh a0, 698(sp)
+; ZVFHMIN64-NEXT:    lh a1, 442(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 157(sp)
+; ZVFHMIN64-NEXT:    lh a0, 696(sp)
+; ZVFHMIN64-NEXT:    lh a1, 440(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 156(sp)
+; ZVFHMIN64-NEXT:    lh a0, 694(sp)
+; ZVFHMIN64-NEXT:    lh a1, 438(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 155(sp)
+; ZVFHMIN64-NEXT:    lh a0, 692(sp)
+; ZVFHMIN64-NEXT:    lh a1, 436(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 154(sp)
+; ZVFHMIN64-NEXT:    lh a0, 690(sp)
+; ZVFHMIN64-NEXT:    lh a1, 434(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 153(sp)
+; ZVFHMIN64-NEXT:    lh a0, 688(sp)
+; ZVFHMIN64-NEXT:    lh a1, 432(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 152(sp)
+; ZVFHMIN64-NEXT:    lh a0, 686(sp)
+; ZVFHMIN64-NEXT:    lh a1, 430(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 151(sp)
+; ZVFHMIN64-NEXT:    lh a0, 684(sp)
+; ZVFHMIN64-NEXT:    lh a1, 428(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 150(sp)
+; ZVFHMIN64-NEXT:    lh a0, 682(sp)
+; ZVFHMIN64-NEXT:    lh a1, 426(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 149(sp)
+; ZVFHMIN64-NEXT:    lh a0, 680(sp)
+; ZVFHMIN64-NEXT:    lh a1, 424(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 148(sp)
+; ZVFHMIN64-NEXT:    lh a0, 678(sp)
+; ZVFHMIN64-NEXT:    lh a1, 422(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 147(sp)
+; ZVFHMIN64-NEXT:    lh a0, 676(sp)
+; ZVFHMIN64-NEXT:    lh a1, 420(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 146(sp)
+; ZVFHMIN64-NEXT:    lh a0, 674(sp)
+; ZVFHMIN64-NEXT:    lh a1, 418(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT:    sb a0, 145(sp)
+; ZVFHMIN64-NEXT:    lh a0, 672(sp)
+; ZVFHMIN64-NEXT:    lh a1, 416(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a2, 128(sp)
+; ZVFHMIN64-NEXT:    sb a0, 144(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 576(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 320(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 32(sp)
+; ZVFHMIN64-NEXT:    sb a0, 224(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 574(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 318(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 31(sp)
+; ZVFHMIN64-NEXT:    sb a0, 223(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 572(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 316(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 30(sp)
+; ZVFHMIN64-NEXT:    sb a0, 222(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 570(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 314(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 29(sp)
+; ZVFHMIN64-NEXT:    sb a0, 221(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 568(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 312(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 28(sp)
+; ZVFHMIN64-NEXT:    sb a0, 220(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 566(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 310(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 27(sp)
+; ZVFHMIN64-NEXT:    sb a0, 219(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 564(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 308(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 26(sp)
+; ZVFHMIN64-NEXT:    sb a0, 218(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 562(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 306(sp)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 29
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 6
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 28
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 5
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 27
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 4
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 26
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 3
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 25
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 24
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 1
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 23
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v26, v8, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 1
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v4, v8, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 8
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v16
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 25(sp)
+; ZVFHMIN64-NEXT:    sb a0, 217(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 304(sp)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v5, v16, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 21
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 20
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 1
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 22
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 14
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 24(sp)
+; ZVFHMIN64-NEXT:    sb a0, 216(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 302(sp)
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v13, v0, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 3
+; ZVFHMIN64-NEXT:    vslidedown.vi v21, v0, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 1
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 15
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 14
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 13
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 6
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 12
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 12
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 11
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 10
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 10
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 4
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v0, v0, 8
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
+; ZVFHMIN64-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s t4, v26
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 23(sp)
+; ZVFHMIN64-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 300(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v20
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v28
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 22(sp)
+; ZVFHMIN64-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 298(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 1
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v0
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 21(sp)
+; ZVFHMIN64-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 296(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v2
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v30
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 20(sp)
+; ZVFHMIN64-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 294(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v22
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 19(sp)
-; ZVFHMIN64-NEXT:    lh a0, 548(sp)
-; ZVFHMIN64-NEXT:    lh a1, 292(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 18(sp)
-; ZVFHMIN64-NEXT:    lh a0, 546(sp)
-; ZVFHMIN64-NEXT:    lh a1, 290(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 17(sp)
-; ZVFHMIN64-NEXT:    lh a0, 544(sp)
-; ZVFHMIN64-NEXT:    lh a1, 288(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v0
+; ZVFHMIN64-NEXT:    sb a0, 211(sp)
+; ZVFHMIN64-NEXT:    lh a1, 548(sp)
+; ZVFHMIN64-NEXT:    lh t5, 292(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v14
+; ZVFHMIN64-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
+; ZVFHMIN64-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a1, 0(sp)
-; ZVFHMIN64-NEXT:    sb a0, 16(sp)
-; ZVFHMIN64-NEXT:    lh a0, 448(sp)
-; ZVFHMIN64-NEXT:    lh a1, 192(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 210(sp)
+; ZVFHMIN64-NEXT:    lh a1, 546(sp)
+; ZVFHMIN64-NEXT:    lh t5, 290(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v24
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 96(sp)
-; ZVFHMIN64-NEXT:    lh a0, 446(sp)
-; ZVFHMIN64-NEXT:    lh a1, 190(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, t5
+; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN64-NEXT:    sb a1, 209(sp)
+; ZVFHMIN64-NEXT:    lh a1, 544(sp)
+; ZVFHMIN64-NEXT:    lh t5, 288(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a4, 192(sp)
+; ZVFHMIN64-NEXT:    sb a1, 208(sp)
+; ZVFHMIN64-NEXT:    lh t5, 738(sp)
+; ZVFHMIN64-NEXT:    lh t6, 482(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v12
+; ZVFHMIN64-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v10
+; ZVFHMIN64-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t5, 177(sp)
+; ZVFHMIN64-NEXT:    lh t5, 736(sp)
+; ZVFHMIN64-NEXT:    lh t6, 480(sp)
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 29
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s5, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 28
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s6, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t5, 176(sp)
+; ZVFHMIN64-NEXT:    lh t5, 734(sp)
+; ZVFHMIN64-NEXT:    lh t6, 478(sp)
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 27
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s7, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 26
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s8, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t5, 175(sp)
+; ZVFHMIN64-NEXT:    lh t5, 732(sp)
+; ZVFHMIN64-NEXT:    lh t6, 476(sp)
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 25
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s4, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 24
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s3, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t5, 174(sp)
+; ZVFHMIN64-NEXT:    lh t6, 730(sp)
+; ZVFHMIN64-NEXT:    lh s9, 474(sp)
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    li a1, 23
+; ZVFHMIN64-NEXT:    mul a0, a0, a1
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh s2, 800(a0) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN64-NEXT:    feq.h t6, fa5, fa4
+; ZVFHMIN64-NEXT:    sb t6, 173(sp)
+; ZVFHMIN64-NEXT:    lh s9, 728(sp)
+; ZVFHMIN64-NEXT:    lh s10, 472(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v31
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v13
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
+; ZVFHMIN64-NEXT:    sb s9, 172(sp)
+; ZVFHMIN64-NEXT:    lh s9, 726(sp)
+; ZVFHMIN64-NEXT:    lh s10, 470(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v29
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v11
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
+; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
+; ZVFHMIN64-NEXT:    sb s9, 171(sp)
+; ZVFHMIN64-NEXT:    lh s10, 724(sp)
+; ZVFHMIN64-NEXT:    lh s11, 468(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v7
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v9
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s10
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s11
+; ZVFHMIN64-NEXT:    feq.h s10, fa5, fa4
+; ZVFHMIN64-NEXT:    sb s10, 170(sp)
+; ZVFHMIN64-NEXT:    lh a0, 722(sp)
+; ZVFHMIN64-NEXT:    lh a1, 466(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v21
+; ZVFHMIN64-NEXT:    vmv.x.s s11, v27
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 95(sp)
-; ZVFHMIN64-NEXT:    lh a0, 444(sp)
-; ZVFHMIN64-NEXT:    lh a1, 188(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    sb a0, 169(sp)
+; ZVFHMIN64-NEXT:    lh a0, 720(sp)
+; ZVFHMIN64-NEXT:    lh a1, 464(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 168(sp)
+; ZVFHMIN64-NEXT:    lh a0, 718(sp)
+; ZVFHMIN64-NEXT:    lh a1, 462(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, s7
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s8
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa1, fa0
+; ZVFHMIN64-NEXT:    fmv.h.x fa1, ra
+; ZVFHMIN64-NEXT:    sb a0, 167(sp)
+; ZVFHMIN64-NEXT:    lh a0, 716(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa0, a2
+; ZVFHMIN64-NEXT:    lh a1, 460(sp)
+; ZVFHMIN64-NEXT:    feq.h s5, fa5, fa1
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 94(sp)
-; ZVFHMIN64-NEXT:    lh a0, 442(sp)
-; ZVFHMIN64-NEXT:    lh a1, 186(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN64-NEXT:    sb a1, 166(sp)
+; ZVFHMIN64-NEXT:    lh a1, 714(sp)
+; ZVFHMIN64-NEXT:    lh a2, 458(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h a3, fa3, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN64-NEXT:    sb a1, 165(sp)
+; ZVFHMIN64-NEXT:    lh a1, 712(sp)
+; ZVFHMIN64-NEXT:    lh a2, 456(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a4
+; ZVFHMIN64-NEXT:    feq.h a4, fa2, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa3, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, s2
+; ZVFHMIN64-NEXT:    sb a1, 164(sp)
+; ZVFHMIN64-NEXT:    lh a1, 710(sp)
+; ZVFHMIN64-NEXT:    lh a2, 454(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s9
+; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s10
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, s11
+; ZVFHMIN64-NEXT:    sb a1, 163(sp)
+; ZVFHMIN64-NEXT:    lh a1, 708(sp)
+; ZVFHMIN64-NEXT:    lh a2, 452(sp)
+; ZVFHMIN64-NEXT:    feq.h s3, fa4, fa5
+; ZVFHMIN64-NEXT:    feq.h s4, fa3, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a1, 162(sp)
+; ZVFHMIN64-NEXT:    lh a1, 706(sp)
+; ZVFHMIN64-NEXT:    lh a2, 450(sp)
+; ZVFHMIN64-NEXT:    sb s4, 129(sp)
+; ZVFHMIN64-NEXT:    sb s3, 130(sp)
+; ZVFHMIN64-NEXT:    sb s2, 131(sp)
+; ZVFHMIN64-NEXT:    sb a4, 132(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a3, 133(sp)
+; ZVFHMIN64-NEXT:    sb a0, 134(sp)
+; ZVFHMIN64-NEXT:    sb s5, 135(sp)
+; ZVFHMIN64-NEXT:    sb a1, 161(sp)
+; ZVFHMIN64-NEXT:    lh a0, 610(sp)
+; ZVFHMIN64-NEXT:    lh a1, 354(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v5
+; ZVFHMIN64-NEXT:    vmv.x.s s5, v23
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 93(sp)
-; ZVFHMIN64-NEXT:    lh a0, 440(sp)
-; ZVFHMIN64-NEXT:    lh a1, 184(sp)
+; ZVFHMIN64-NEXT:    sb a0, 241(sp)
+; ZVFHMIN64-NEXT:    lh a0, 608(sp)
+; ZVFHMIN64-NEXT:    lh a1, 352(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 21
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 20
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 92(sp)
-; ZVFHMIN64-NEXT:    lh a0, 438(sp)
-; ZVFHMIN64-NEXT:    lh a1, 182(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    sb a0, 240(sp)
+; ZVFHMIN64-NEXT:    lh a0, 606(sp)
+; ZVFHMIN64-NEXT:    lh a1, 350(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 22
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa3
+; ZVFHMIN64-NEXT:    sb a0, 239(sp)
+; ZVFHMIN64-NEXT:    lh a0, 604(sp)
+; ZVFHMIN64-NEXT:    lh a1, 348(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t6
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 7
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 238(sp)
+; ZVFHMIN64-NEXT:    lh a0, 602(sp)
+; ZVFHMIN64-NEXT:    lh a1, 346(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 6
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 237(sp)
+; ZVFHMIN64-NEXT:    lh a0, 600(sp)
+; ZVFHMIN64-NEXT:    lh a1, 344(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 5
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 236(sp)
+; ZVFHMIN64-NEXT:    lh a0, 598(sp)
+; ZVFHMIN64-NEXT:    lh a1, 342(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 4
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 235(sp)
+; ZVFHMIN64-NEXT:    lh a0, 596(sp)
+; ZVFHMIN64-NEXT:    lh a1, 340(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s8, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 3
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 234(sp)
+; ZVFHMIN64-NEXT:    lh a0, 594(sp)
+; ZVFHMIN64-NEXT:    lh a1, 338(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v8
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    sb a0, 233(sp)
+; ZVFHMIN64-NEXT:    lh a0, 592(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    lh t5, 336(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 1
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a0
+; ZVFHMIN64-NEXT:    vmv.x.s s7, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, t5
+; ZVFHMIN64-NEXT:    feq.h a0, fa3, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa3, a2
+; ZVFHMIN64-NEXT:    sb a0, 232(sp)
+; ZVFHMIN64-NEXT:    lh a0, 590(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa2, a3
+; ZVFHMIN64-NEXT:    lh a2, 334(sp)
+; ZVFHMIN64-NEXT:    feq.h t5, fa5, fa3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    feq.h t6, fa4, fa2
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 91(sp)
-; ZVFHMIN64-NEXT:    lh a0, 436(sp)
-; ZVFHMIN64-NEXT:    lh a1, 180(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s6
+; ZVFHMIN64-NEXT:    sb a0, 231(sp)
+; ZVFHMIN64-NEXT:    lh a0, 588(sp)
+; ZVFHMIN64-NEXT:    lh a2, 332(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 90(sp)
-; ZVFHMIN64-NEXT:    lh a0, 434(sp)
-; ZVFHMIN64-NEXT:    lh a1, 178(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s5
+; ZVFHMIN64-NEXT:    sb a0, 230(sp)
+; ZVFHMIN64-NEXT:    lh a0, 586(sp)
+; ZVFHMIN64-NEXT:    lh a2, 330(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s8
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 89(sp)
-; ZVFHMIN64-NEXT:    lh a0, 432(sp)
-; ZVFHMIN64-NEXT:    lh a1, 176(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s4
+; ZVFHMIN64-NEXT:    sb a0, 229(sp)
+; ZVFHMIN64-NEXT:    lh a0, 584(sp)
+; ZVFHMIN64-NEXT:    lh a2, 328(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
+; ZVFHMIN64-NEXT:    feq.h s4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 88(sp)
-; ZVFHMIN64-NEXT:    lh a0, 430(sp)
-; ZVFHMIN64-NEXT:    lh a1, 174(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
+; ZVFHMIN64-NEXT:    sb a0, 228(sp)
+; ZVFHMIN64-NEXT:    lh a0, 582(sp)
+; ZVFHMIN64-NEXT:    lh a2, 326(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 87(sp)
-; ZVFHMIN64-NEXT:    lh a0, 428(sp)
-; ZVFHMIN64-NEXT:    lh a1, 172(sp)
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 86(sp)
-; ZVFHMIN64-NEXT:    lh a0, 426(sp)
-; ZVFHMIN64-NEXT:    lh a1, 170(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN64-NEXT:    sb a0, 227(sp)
+; ZVFHMIN64-NEXT:    lh a0, 580(sp)
+; ZVFHMIN64-NEXT:    lh a2, 324(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s7
+; ZVFHMIN64-NEXT:    feq.h s2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 85(sp)
-; ZVFHMIN64-NEXT:    lh a0, 424(sp)
-; ZVFHMIN64-NEXT:    lh a1, 168(sp)
+; ZVFHMIN64-NEXT:    sb a0, 226(sp)
+; ZVFHMIN64-NEXT:    lh a0, 578(sp)
+; ZVFHMIN64-NEXT:    lh a2, 322(sp)
+; ZVFHMIN64-NEXT:    sb s2, 193(sp)
+; ZVFHMIN64-NEXT:    sb a1, 194(sp)
+; ZVFHMIN64-NEXT:    sb s4, 195(sp)
+; ZVFHMIN64-NEXT:    sb a4, 196(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 84(sp)
-; ZVFHMIN64-NEXT:    lh a0, 422(sp)
-; ZVFHMIN64-NEXT:    lh a1, 166(sp)
+; ZVFHMIN64-NEXT:    sb a3, 197(sp)
+; ZVFHMIN64-NEXT:    sb t6, 198(sp)
+; ZVFHMIN64-NEXT:    sb t5, 199(sp)
+; ZVFHMIN64-NEXT:    sb a0, 225(sp)
+; ZVFHMIN64-NEXT:    lh a0, 766(sp)
+; ZVFHMIN64-NEXT:    lh a1, 510(sp)
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 14
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 83(sp)
-; ZVFHMIN64-NEXT:    lh a0, 420(sp)
-; ZVFHMIN64-NEXT:    lh a1, 164(sp)
+; ZVFHMIN64-NEXT:    sb a0, 191(sp)
+; ZVFHMIN64-NEXT:    lh a0, 764(sp)
+; ZVFHMIN64-NEXT:    lh a1, 508(sp)
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v6
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 2
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 82(sp)
-; ZVFHMIN64-NEXT:    lh a0, 418(sp)
-; ZVFHMIN64-NEXT:    lh a1, 162(sp)
+; ZVFHMIN64-NEXT:    sb a0, 190(sp)
+; ZVFHMIN64-NEXT:    lh a0, 762(sp)
+; ZVFHMIN64-NEXT:    lh a1, 506(sp)
+; ZVFHMIN64-NEXT:    csrr a3, vlenb
+; ZVFHMIN64-NEXT:    slli a3, a3, 3
+; ZVFHMIN64-NEXT:    add a3, sp, a3
+; ZVFHMIN64-NEXT:    addi a3, a3, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
+; ZVFHMIN64-NEXT:    csrr a4, vlenb
+; ZVFHMIN64-NEXT:    li s3, 6
+; ZVFHMIN64-NEXT:    mul a4, a4, s3
+; ZVFHMIN64-NEXT:    add a4, sp, a4
+; ZVFHMIN64-NEXT:    addi a4, a4, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 81(sp)
-; ZVFHMIN64-NEXT:    lh a0, 416(sp)
-; ZVFHMIN64-NEXT:    lh a1, 160(sp)
+; ZVFHMIN64-NEXT:    sb a0, 189(sp)
+; ZVFHMIN64-NEXT:    lh a0, 760(sp)
+; ZVFHMIN64-NEXT:    lh a1, 504(sp)
+; ZVFHMIN64-NEXT:    csrr s3, vlenb
+; ZVFHMIN64-NEXT:    li s4, 12
+; ZVFHMIN64-NEXT:    mul s3, s3, s4
+; ZVFHMIN64-NEXT:    add s3, sp, s3
+; ZVFHMIN64-NEXT:    addi s3, s3, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s6, v8
+; ZVFHMIN64-NEXT:    csrr s3, vlenb
+; ZVFHMIN64-NEXT:    li s4, 10
+; ZVFHMIN64-NEXT:    mul s3, s3, s4
+; ZVFHMIN64-NEXT:    add s3, sp, s3
+; ZVFHMIN64-NEXT:    addi s3, s3, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s4, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v24
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v16
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a1, 64(sp)
-; ZVFHMIN64-NEXT:    sb a0, 80(sp)
-; ZVFHMIN64-NEXT:    lh a0, 610(sp)
-; ZVFHMIN64-NEXT:    lh a1, 354(sp)
+; ZVFHMIN64-NEXT:    sb a0, 188(sp)
+; ZVFHMIN64-NEXT:    lh a0, 758(sp)
+; ZVFHMIN64-NEXT:    lh a1, 502(sp)
+; ZVFHMIN64-NEXT:    csrr s3, vlenb
+; ZVFHMIN64-NEXT:    slli s3, s3, 4
+; ZVFHMIN64-NEXT:    add s3, sp, s3
+; ZVFHMIN64-NEXT:    addi s3, s3, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s5, v8
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v16
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 49(sp)
-; ZVFHMIN64-NEXT:    lh a0, 608(sp)
-; ZVFHMIN64-NEXT:    lh a1, 352(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT:    sb a0, 187(sp)
+; ZVFHMIN64-NEXT:    lh a0, 756(sp)
+; ZVFHMIN64-NEXT:    lh a1, 500(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
+; ZVFHMIN64-NEXT:    feq.h t4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 48(sp)
-; ZVFHMIN64-NEXT:    lh a0, 606(sp)
-; ZVFHMIN64-NEXT:    lh a1, 350(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN64-NEXT:    sb a0, 186(sp)
+; ZVFHMIN64-NEXT:    lh a0, 754(sp)
+; ZVFHMIN64-NEXT:    lh a1, 498(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    feq.h t3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 47(sp)
-; ZVFHMIN64-NEXT:    lh a1, 604(sp)
-; ZVFHMIN64-NEXT:    lh a2, 348(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 7
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 46(sp)
-; ZVFHMIN64-NEXT:    lh a2, 602(sp)
-; ZVFHMIN64-NEXT:    lh a3, 346(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 45(sp)
-; ZVFHMIN64-NEXT:    lh a3, 600(sp)
-; ZVFHMIN64-NEXT:    lh a4, 344(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 6
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a3, 44(sp)
-; ZVFHMIN64-NEXT:    lh a4, 598(sp)
-; ZVFHMIN64-NEXT:    lh a5, 342(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 6
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 43(sp)
-; ZVFHMIN64-NEXT:    lh a5, 596(sp)
-; ZVFHMIN64-NEXT:    lh a6, 340(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 5
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a5, 42(sp)
-; ZVFHMIN64-NEXT:    lh a6, 594(sp)
-; ZVFHMIN64-NEXT:    lh a7, 338(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 5
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a6, 41(sp)
-; ZVFHMIN64-NEXT:    lh a7, 592(sp)
-; ZVFHMIN64-NEXT:    lh t0, 336(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 4
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a7, 40(sp)
-; ZVFHMIN64-NEXT:    lh t0, 590(sp)
-; ZVFHMIN64-NEXT:    lh t1, 334(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 4
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t0, 39(sp)
-; ZVFHMIN64-NEXT:    lh t1, 588(sp)
-; ZVFHMIN64-NEXT:    lh t2, 332(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 3
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v10
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    sb a0, 185(sp)
+; ZVFHMIN64-NEXT:    lh a0, 752(sp)
+; ZVFHMIN64-NEXT:    lh a1, 496(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
 ; ZVFHMIN64-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t1, 38(sp)
-; ZVFHMIN64-NEXT:    lh t2, 586(sp)
-; ZVFHMIN64-NEXT:    lh t3, 330(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 3
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 37(sp)
-; ZVFHMIN64-NEXT:    lh t2, 584(sp)
-; ZVFHMIN64-NEXT:    lh t3, 328(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 2
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 36(sp)
-; ZVFHMIN64-NEXT:    lh t2, 582(sp)
-; ZVFHMIN64-NEXT:    lh t3, 326(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 2
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 35(sp)
-; ZVFHMIN64-NEXT:    lh t2, 580(sp)
-; ZVFHMIN64-NEXT:    lh t3, 324(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 1
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 34(sp)
-; ZVFHMIN64-NEXT:    lh t2, 578(sp)
-; ZVFHMIN64-NEXT:    lh t3, 322(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 1
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v10
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 184(sp)
+; ZVFHMIN64-NEXT:    lh a0, 750(sp)
+; ZVFHMIN64-NEXT:    lh a1, 494(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s6
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a2, 5(sp)
-; ZVFHMIN64-NEXT:    sb a1, 6(sp)
-; ZVFHMIN64-NEXT:    sb a0, 7(sp)
-; ZVFHMIN64-NEXT:    sb t2, 33(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 1(sp)
-; ZVFHMIN64-NEXT:    sb a2, 2(sp)
-; ZVFHMIN64-NEXT:    sb a1, 3(sp)
-; ZVFHMIN64-NEXT:    sb a0, 4(sp)
-; ZVFHMIN64-NEXT:    lh a0, 482(sp)
-; ZVFHMIN64-NEXT:    lh a1, 226(sp)
+; ZVFHMIN64-NEXT:    sb a0, 183(sp)
+; ZVFHMIN64-NEXT:    lh a0, 748(sp)
+; ZVFHMIN64-NEXT:    lh a1, 492(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s4
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 113(sp)
-; ZVFHMIN64-NEXT:    lh a0, 480(sp)
-; ZVFHMIN64-NEXT:    lh a1, 224(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
+; ZVFHMIN64-NEXT:    sb a0, 182(sp)
+; ZVFHMIN64-NEXT:    lh a0, 746(sp)
+; ZVFHMIN64-NEXT:    lh a1, 490(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s5
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 112(sp)
-; ZVFHMIN64-NEXT:    lh a0, 478(sp)
-; ZVFHMIN64-NEXT:    lh a1, 222(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT:    sb a0, 181(sp)
+; ZVFHMIN64-NEXT:    lh a0, 744(sp)
+; ZVFHMIN64-NEXT:    lh a1, 488(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, s3
+; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 111(sp)
-; ZVFHMIN64-NEXT:    lh a1, 476(sp)
-; ZVFHMIN64-NEXT:    lh a2, 220(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 7
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a1, 110(sp)
-; ZVFHMIN64-NEXT:    lh a2, 474(sp)
-; ZVFHMIN64-NEXT:    lh a3, 218(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 7
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 109(sp)
-; ZVFHMIN64-NEXT:    lh a3, 472(sp)
-; ZVFHMIN64-NEXT:    lh a4, 216(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 6
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a3, 108(sp)
-; ZVFHMIN64-NEXT:    lh a4, 470(sp)
-; ZVFHMIN64-NEXT:    lh a5, 214(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 6
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 107(sp)
-; ZVFHMIN64-NEXT:    lh a5, 468(sp)
-; ZVFHMIN64-NEXT:    lh a6, 212(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 5
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v10
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
-; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a5, 106(sp)
-; ZVFHMIN64-NEXT:    lh a6, 466(sp)
-; ZVFHMIN64-NEXT:    lh a7, 210(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 5
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
+; ZVFHMIN64-NEXT:    addi a1, sp, 800
+; ZVFHMIN64-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
+; ZVFHMIN64-NEXT:    sb a0, 180(sp)
+; ZVFHMIN64-NEXT:    lh a0, 742(sp)
+; ZVFHMIN64-NEXT:    lh a7, 486(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    feq.h a1, fa5, fa4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a6, 105(sp)
-; ZVFHMIN64-NEXT:    lh a7, 464(sp)
-; ZVFHMIN64-NEXT:    lh t0, 208(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 4
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a7
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
-; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a7, 104(sp)
-; ZVFHMIN64-NEXT:    lh t0, 462(sp)
-; ZVFHMIN64-NEXT:    lh t1, 206(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 4
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t0, 103(sp)
-; ZVFHMIN64-NEXT:    lh t1, 460(sp)
-; ZVFHMIN64-NEXT:    lh t2, 204(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 3
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
-; ZVFHMIN64-NEXT:    feq.h t1, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t1, 102(sp)
-; ZVFHMIN64-NEXT:    lh t2, 458(sp)
-; ZVFHMIN64-NEXT:    lh t3, 202(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 3
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 101(sp)
-; ZVFHMIN64-NEXT:    lh t2, 456(sp)
-; ZVFHMIN64-NEXT:    lh t3, 200(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 2
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 100(sp)
-; ZVFHMIN64-NEXT:    lh t2, 454(sp)
-; ZVFHMIN64-NEXT:    lh t3, 198(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 2
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 99(sp)
-; ZVFHMIN64-NEXT:    lh t2, 452(sp)
-; ZVFHMIN64-NEXT:    lh t3, 196(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 1
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb t2, 98(sp)
-; ZVFHMIN64-NEXT:    lh t2, 450(sp)
-; ZVFHMIN64-NEXT:    lh t3, 194(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 1
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 179(sp)
+; ZVFHMIN64-NEXT:    lh a0, 740(sp)
+; ZVFHMIN64-NEXT:    lh a7, 484(sp)
+; ZVFHMIN64-NEXT:    sb a2, 140(sp)
+; ZVFHMIN64-NEXT:    sb t1, 141(sp)
+; ZVFHMIN64-NEXT:    sb t3, 142(sp)
+; ZVFHMIN64-NEXT:    sb t4, 143(sp)
+; ZVFHMIN64-NEXT:    sb a1, 136(sp)
+; ZVFHMIN64-NEXT:    sb a6, 137(sp)
+; ZVFHMIN64-NEXT:    sb a4, 138(sp)
+; ZVFHMIN64-NEXT:    sb a3, 139(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a2, 69(sp)
-; ZVFHMIN64-NEXT:    sb a1, 70(sp)
-; ZVFHMIN64-NEXT:    sb a0, 71(sp)
-; ZVFHMIN64-NEXT:    sb t2, 97(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 65(sp)
-; ZVFHMIN64-NEXT:    sb a2, 66(sp)
-; ZVFHMIN64-NEXT:    sb a1, 67(sp)
-; ZVFHMIN64-NEXT:    sb a0, 68(sp)
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 178(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 638(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 382(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 14
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 63(sp)
+; ZVFHMIN64-NEXT:    sb a0, 255(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 636(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 380(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 13
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 62(sp)
+; ZVFHMIN64-NEXT:    sb a0, 254(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 634(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 378(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 12
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 61(sp)
+; ZVFHMIN64-NEXT:    sb a0, 253(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 632(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 376(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 11
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 60(sp)
+; ZVFHMIN64-NEXT:    sb a0, 252(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 630(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 374(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 10
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 59(sp)
+; ZVFHMIN64-NEXT:    sb a0, 251(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 628(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 372(sp)
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 9
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 58(sp)
+; ZVFHMIN64-NEXT:    ld a1, 96(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 250(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 626(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 370(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
+; ZVFHMIN64-NEXT:    feq.h a3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 57(sp)
+; ZVFHMIN64-NEXT:    ld a1, 104(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 249(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 624(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 368(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
+; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 56(sp)
+; ZVFHMIN64-NEXT:    ld a1, 120(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 248(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 622(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 366(sp)
-; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v0, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v22, v0, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v26, v0, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v28, v0, 12
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v0, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v0, 10
-; ZVFHMIN64-NEXT:    vslidedown.vi v12, v0, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v14, v0, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v20
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t2
+; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 55(sp)
+; ZVFHMIN64-NEXT:    ld a1, 88(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 247(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 620(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 364(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 15
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v20
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v22
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
+; ZVFHMIN64-NEXT:    feq.h a5, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 54(sp)
+; ZVFHMIN64-NEXT:    ld a1, 112(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
+; ZVFHMIN64-NEXT:    sb a0, 246(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 618(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 362(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v20
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v26
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, t0
+; ZVFHMIN64-NEXT:    feq.h t0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 53(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, s2
+; ZVFHMIN64-NEXT:    sb a0, 245(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 616(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 360(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 13
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v20
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v28
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
+; ZVFHMIN64-NEXT:    feq.h a7, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 52(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT:    sb a0, 244(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 614(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 358(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 12
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v20
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a6
+; ZVFHMIN64-NEXT:    feq.h a6, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 51(sp)
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 8
+; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
+; ZVFHMIN64-NEXT:    sb a0, 243(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 612(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 356(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v18
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v8, 11
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v18
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 50(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 12(sp)
-; ZVFHMIN64-NEXT:    sb a2, 13(sp)
-; ZVFHMIN64-NEXT:    sb a1, 14(sp)
-; ZVFHMIN64-NEXT:    sb a0, 15(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a1
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 10
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v12
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 9
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v14
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a3
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v8, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 8(sp)
-; ZVFHMIN64-NEXT:    sb a2, 9(sp)
-; ZVFHMIN64-NEXT:    sb a1, 10(sp)
-; ZVFHMIN64-NEXT:    sb a0, 11(sp)
-; ZVFHMIN64-NEXT:    lh a0, 510(sp)
-; ZVFHMIN64-NEXT:    lh a1, 254(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 127(sp)
-; ZVFHMIN64-NEXT:    lh a0, 508(sp)
-; ZVFHMIN64-NEXT:    lh a1, 252(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 126(sp)
-; ZVFHMIN64-NEXT:    lh a0, 506(sp)
-; ZVFHMIN64-NEXT:    lh a1, 250(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 125(sp)
-; ZVFHMIN64-NEXT:    lh a0, 504(sp)
-; ZVFHMIN64-NEXT:    lh a1, 248(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 124(sp)
-; ZVFHMIN64-NEXT:    lh a0, 502(sp)
-; ZVFHMIN64-NEXT:    lh a1, 246(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 123(sp)
-; ZVFHMIN64-NEXT:    lh a0, 500(sp)
-; ZVFHMIN64-NEXT:    lh a1, 244(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 122(sp)
-; ZVFHMIN64-NEXT:    lh a0, 498(sp)
-; ZVFHMIN64-NEXT:    lh a1, 242(sp)
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v24, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v12, v24, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v14, v24, 12
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v24, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v24, 10
-; ZVFHMIN64-NEXT:    vslidedown.vi v22, v24, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v24, v24, 8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a0, 121(sp)
-; ZVFHMIN64-NEXT:    lh a2, 496(sp)
-; ZVFHMIN64-NEXT:    lh a3, 240(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 15
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
+; ZVFHMIN64-NEXT:    sb a5, 204(sp)
+; ZVFHMIN64-NEXT:    sb a4, 205(sp)
+; ZVFHMIN64-NEXT:    sb a2, 206(sp)
+; ZVFHMIN64-NEXT:    sb a3, 207(sp)
 ; ZVFHMIN64-NEXT:    feq.h a2, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a2, 120(sp)
-; ZVFHMIN64-NEXT:    lh a4, 494(sp)
-; ZVFHMIN64-NEXT:    lh a5, 238(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v10
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 14
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 119(sp)
-; ZVFHMIN64-NEXT:    lh a4, 492(sp)
-; ZVFHMIN64-NEXT:    lh a5, 236(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v12
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 13
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 118(sp)
-; ZVFHMIN64-NEXT:    lh a4, 490(sp)
-; ZVFHMIN64-NEXT:    lh a5, 234(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v14
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 12
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 117(sp)
-; ZVFHMIN64-NEXT:    lh a4, 488(sp)
-; ZVFHMIN64-NEXT:    lh a5, 232(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v18
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 11
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 116(sp)
-; ZVFHMIN64-NEXT:    lh a4, 486(sp)
-; ZVFHMIN64-NEXT:    lh a5, 230(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v20
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 10
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 115(sp)
-; ZVFHMIN64-NEXT:    lh a4, 484(sp)
-; ZVFHMIN64-NEXT:    lh a5, 228(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v22
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 9
-; ZVFHMIN64-NEXT:    vmv.x.s s2, v8
-; ZVFHMIN64-NEXT:    vmv.x.s s3, v24
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a5
-; ZVFHMIN64-NEXT:    feq.h a4, fa5, fa4
-; ZVFHMIN64-NEXT:    sb a4, 114(sp)
+; ZVFHMIN64-NEXT:    sb a2, 200(sp)
+; ZVFHMIN64-NEXT:    sb a6, 201(sp)
+; ZVFHMIN64-NEXT:    sb a7, 202(sp)
+; ZVFHMIN64-NEXT:    sb t0, 203(sp)
+; ZVFHMIN64-NEXT:    li a2, 128
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, a6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a7
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t0
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t1
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 76(sp)
-; ZVFHMIN64-NEXT:    sb a2, 77(sp)
-; ZVFHMIN64-NEXT:    sb a1, 78(sp)
-; ZVFHMIN64-NEXT:    sb a0, 79(sp)
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t3
-; ZVFHMIN64-NEXT:    feq.h a0, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, t5
-; ZVFHMIN64-NEXT:    feq.h a1, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, s2
-; ZVFHMIN64-NEXT:    feq.h a2, fa4, fa5
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, s3
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 8
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
-; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h a3, fa4, fa5
-; ZVFHMIN64-NEXT:    sb a3, 72(sp)
-; ZVFHMIN64-NEXT:    sb a2, 73(sp)
-; ZVFHMIN64-NEXT:    sb a1, 74(sp)
-; ZVFHMIN64-NEXT:    sb a0, 75(sp)
-; ZVFHMIN64-NEXT:    li a0, 128
-; ZVFHMIN64-NEXT:    mv a1, sp
-; ZVFHMIN64-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; ZVFHMIN64-NEXT:    vle8.v v8, (a1)
+; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
+; ZVFHMIN64-NEXT:    sb a0, 242(sp)
+; ZVFHMIN64-NEXT:    addi a0, sp, 128
+; ZVFHMIN64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; ZVFHMIN64-NEXT:    vle8.v v8, (a0)
 ; ZVFHMIN64-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN64-NEXT:    vmsne.vi v0, v8, 0
-; ZVFHMIN64-NEXT:    addi sp, s0, -768
-; ZVFHMIN64-NEXT:    .cfi_def_cfa sp, 768
-; ZVFHMIN64-NEXT:    ld ra, 760(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s0, 752(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s2, 744(sp) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    ld s3, 736(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    addi sp, s0, -896
+; ZVFHMIN64-NEXT:    .cfi_def_cfa sp, 896
+; ZVFHMIN64-NEXT:    ld ra, 888(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s0, 880(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s2, 872(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s3, 864(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s4, 856(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s5, 848(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s6, 840(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s7, 832(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s8, 824(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s9, 816(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s10, 808(sp) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    ld s11, 800(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    .cfi_restore ra
 ; ZVFHMIN64-NEXT:    .cfi_restore s0
 ; ZVFHMIN64-NEXT:    .cfi_restore s2
 ; ZVFHMIN64-NEXT:    .cfi_restore s3
-; ZVFHMIN64-NEXT:    addi sp, sp, 768
+; ZVFHMIN64-NEXT:    .cfi_restore s4
+; ZVFHMIN64-NEXT:    .cfi_restore s5
+; ZVFHMIN64-NEXT:    .cfi_restore s6
+; ZVFHMIN64-NEXT:    .cfi_restore s7
+; ZVFHMIN64-NEXT:    .cfi_restore s8
+; ZVFHMIN64-NEXT:    .cfi_restore s9
+; ZVFHMIN64-NEXT:    .cfi_restore s10
+; ZVFHMIN64-NEXT:    .cfi_restore s11
+; ZVFHMIN64-NEXT:    addi sp, sp, 896
 ; ZVFHMIN64-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN64-NEXT:    ret
   %v = call <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half> %va, <128 x half> %vb, metadata !"oeq", <128 x i1> %m, i32 %evl)
@@ -3435,38 +3947,38 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v16, (a0)
 ; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v6, v0, 2
 ; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    bltu a2, a1, .LBB87_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB87_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v7, v8, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v7, v8, v16, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v8, v16, v24, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index 4afa75e87c8f86..d52c42891fcc3b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -28,8 +28,8 @@ define <8 x i1> @icmp_eq_vx_v8i7(<8 x i7> %va, i7 %b, <8 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmseq.vv v0, v8, v9, v0.t
@@ -45,8 +45,8 @@ define <8 x i1> @icmp_eq_vx_swap_v8i7(<8 x i7> %va, i7 %b, <8 x i1> %m, i32 zero
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmseq.vv v0, v9, v8, v0.t
@@ -605,11 +605,11 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a1, 128
+; CHECK-NEXT:    addi a4, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a2)
-; CHECK-NEXT:    addi a2, a0, 128
-; CHECK-NEXT:    vle8.v v8, (a2)
 ; CHECK-NEXT:    addi a2, a3, -128
+; CHECK-NEXT:    vle8.v v8, (a4)
 ; CHECK-NEXT:    sltu a4, a3, a2
 ; CHECK-NEXT:    vle8.v v24, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -1253,38 +1253,38 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    li a3, 32
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v6, v0, 4
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    li a3, 32
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vle32.v v16, (a1)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle32.v v16, (a0)
 ; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 4
 ; CHECK-NEXT:    bltu a2, a3, .LBB99_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:  .LBB99_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v7, v8, v24, v0.t
+; CHECK-NEXT:    vmseq.vv v7, v8, v16, v0.t
 ; CHECK-NEXT:    addi a0, a2, -32
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v8, v16, v24, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
index c9e6a8730eec7e..8b18be908089f2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-changes-length.ll
@@ -9,28 +9,28 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) {
 ; RV32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v0
 ; RV32-NEXT:    slli a1, a0, 18
-; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    srli a2, a0, 31
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    slli a1, a0, 27
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    slli a1, a0, 28
+; RV32-NEXT:    slli a2, a0, 27
 ; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    vslide1down.vx v8, v8, a1
-; RV32-NEXT:    slli a1, a0, 19
+; RV32-NEXT:    slli a1, a0, 26
 ; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    slli a2, a0, 26
+; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    slli a1, a0, 28
 ; RV32-NEXT:    srli a2, a2, 31
-; RV32-NEXT:    vmv.v.x v9, a2
-; RV32-NEXT:    vslide1down.vx v9, v9, a1
-; RV32-NEXT:    slli a1, a0, 24
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v9, v9, a1
+; RV32-NEXT:    vslide1down.vx v8, v8, a2
+; RV32-NEXT:    slli a2, a0, 19
+; RV32-NEXT:    srli a2, a2, 31
+; RV32-NEXT:    vslide1down.vx v9, v9, a2
+; RV32-NEXT:    slli a2, a0, 24
 ; RV32-NEXT:    slli a0, a0, 29
+; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    srli a2, a2, 31
 ; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
+; RV32-NEXT:    vslide1down.vx v9, v9, a2
 ; RV32-NEXT:    vmv.v.i v0, 15
 ; RV32-NEXT:    vslide1down.vx v9, v9, a0
 ; RV32-NEXT:    vslidedown.vi v8, v9, 4, v0.t
@@ -43,28 +43,28 @@ define <8 x i1> @v8i1_v16i1(<16 x i1>) {
 ; RV64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; RV64-NEXT:    vmv.x.s a0, v0
 ; RV64-NEXT:    slli a1, a0, 50
-; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    srli a2, a0, 63
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
 ; RV64-NEXT:    vmv.v.x v8, a2
-; RV64-NEXT:    vslide1down.vx v8, v8, a1
-; RV64-NEXT:    slli a1, a0, 59
+; RV64-NEXT:    slli a2, a0, 59
 ; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    vslide1down.vx v8, v8, a1
-; RV64-NEXT:    slli a1, a0, 60
+; RV64-NEXT:    slli a1, a0, 58
 ; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v8, v8, a1
-; RV64-NEXT:    slli a1, a0, 51
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    slli a2, a0, 58
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    slli a1, a0, 60
 ; RV64-NEXT:    srli a2, a2, 63
-; RV64-NEXT:    vmv.v.x v9, a2
-; RV64-NEXT:    vslide1down.vx v9, v9, a1
-; RV64-NEXT:    slli a1, a0, 56
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v9, v9, a1
+; RV64-NEXT:    vslide1down.vx v8, v8, a2
+; RV64-NEXT:    slli a2, a0, 51
+; RV64-NEXT:    srli a2, a2, 63
+; RV64-NEXT:    vslide1down.vx v9, v9, a2
+; RV64-NEXT:    slli a2, a0, 56
 ; RV64-NEXT:    slli a0, a0, 61
+; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    srli a2, a2, 63
 ; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
+; RV64-NEXT:    vslide1down.vx v9, v9, a2
 ; RV64-NEXT:    vmv.v.i v0, 15
 ; RV64-NEXT:    vslide1down.vx v9, v9, a0
 ; RV64-NEXT:    vslidedown.vi v8, v9, 4, v0.t
@@ -80,10 +80,10 @@ define <4 x i32> @v4i32_v8i32(<8 x i32>) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vmv.v.i v0, 5
 ; CHECK-NEXT:    vsrl.vi v10, v10, 1
 ; CHECK-NEXT:    vrsub.vi v11, v10, 3
 ; CHECK-NEXT:    vrgather.vv v10, v8, v11
-; CHECK-NEXT:    vmv.v.i v0, 5
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 4
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
@@ -97,21 +97,20 @@ define <4 x i32> @v4i32_v8i32(<8 x i32>) {
 define <4 x i32> @v4i32_v16i32(<16 x i32>) {
 ; RV32-LABEL: v4i32_v16i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; RV32-NEXT:    vmv.v.i v12, 1
-; RV32-NEXT:    vmv.v.i v13, 6
-; RV32-NEXT:    vsetivli zero, 2, e16, m1, tu, ma
-; RV32-NEXT:    vslideup.vi v13, v12, 1
 ; RV32-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v16, v8, 8
 ; RV32-NEXT:    vmv4r.v v20, v8
+; RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; RV32-NEXT:    vmv.v.i v8, 1
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv.v.i v10, 6
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vmv2r.v v22, v14
-; RV32-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; RV32-NEXT:    vmv.v.i v0, 10
+; RV32-NEXT:    vsetivli zero, 2, e16, m1, tu, ma
+; RV32-NEXT:    vslideup.vi v10, v8, 1
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; RV32-NEXT:    vnsrl.wx v8, v20, a0
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v13, v0.t
+; RV32-NEXT:    vrgatherei16.vv v8, v16, v10, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: v4i32_v16i32:
@@ -121,14 +120,15 @@ define <4 x i32> @v4i32_v16i32(<16 x i32>) {
 ; RV64-NEXT:    vmv4r.v v20, v8
 ; RV64-NEXT:    li a0, 32
 ; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT:    vmv.v.i v0, 10
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV64-NEXT:    vnsrl.wx v8, v20, a0
-; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v0, 10
 ; RV64-NEXT:    li a0, 3
 ; RV64-NEXT:    slli a0, a0, 33
 ; RV64-NEXT:    addi a0, a0, 1
 ; RV64-NEXT:    slli a0, a0, 16
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; RV64-NEXT:    vrgatherei16.vv v8, v16, v10, v0.t
@@ -151,21 +151,21 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) {
 ; RV32-NEXT:    andi sp, sp, -128
 ; RV32-NEXT:    li a0, 32
 ; RV32-NEXT:    mv a1, sp
+; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v16, v8, 1
 ; RV32-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV32-NEXT:    vse32.v v8, (a1)
-; RV32-NEXT:    lw a0, 36(sp)
-; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 1
-; RV32-NEXT:    vmv.x.s a1, v10
-; RV32-NEXT:    vmv.v.x v10, a1
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    lw a0, 120(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 4
-; RV32-NEXT:    vmv.x.s a1, v8
+; RV32-NEXT:    lw a0, 36(sp)
+; RV32-NEXT:    vmv.x.s a1, v16
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT:    vslide1down.vx v8, v10, a1
-; RV32-NEXT:    vslide1down.vx v8, v8, a0
+; RV32-NEXT:    vmv.v.x v9, a1
+; RV32-NEXT:    lw a1, 120(sp)
+; RV32-NEXT:    vslide1down.vx v9, v9, a0
+; RV32-NEXT:    vmv.x.s a0, v8
+; RV32-NEXT:    vslide1down.vx v8, v9, a0
+; RV32-NEXT:    vslide1down.vx v8, v8, a1
 ; RV32-NEXT:    addi sp, s0, -256
 ; RV32-NEXT:    .cfi_def_cfa sp, 256
 ; RV32-NEXT:    lw ra, 252(sp) # 4-byte Folded Reload
@@ -189,21 +189,21 @@ define <4 x i32> @v4i32_v32i32(<32 x i32>) {
 ; RV64-NEXT:    andi sp, sp, -128
 ; RV64-NEXT:    li a0, 32
 ; RV64-NEXT:    mv a1, sp
+; RV64-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v16, v8, 1
 ; RV64-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; RV64-NEXT:    vse32.v v8, (a1)
-; RV64-NEXT:    lw a0, 36(sp)
-; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 1
-; RV64-NEXT:    vmv.x.s a1, v10
-; RV64-NEXT:    vmv.v.x v10, a1
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    lw a0, 120(sp)
 ; RV64-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 4
-; RV64-NEXT:    vmv.x.s a1, v8
+; RV64-NEXT:    lw a0, 36(sp)
+; RV64-NEXT:    vmv.x.s a1, v16
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vslide1down.vx v8, v10, a1
-; RV64-NEXT:    vslide1down.vx v8, v8, a0
+; RV64-NEXT:    vmv.v.x v9, a1
+; RV64-NEXT:    lw a1, 120(sp)
+; RV64-NEXT:    vslide1down.vx v9, v9, a0
+; RV64-NEXT:    vmv.x.s a0, v8
+; RV64-NEXT:    vslide1down.vx v8, v9, a0
+; RV64-NEXT:    vslide1down.vx v8, v8, a1
 ; RV64-NEXT:    addi sp, s0, -256
 ; RV64-NEXT:    .cfi_def_cfa sp, 256
 ; RV64-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
@@ -251,22 +251,24 @@ define <16 x i32> @v16i32_v4i32(<4 x i32>) {
 ; CHECK-LABEL: v16i32_v4i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a0, 2
-; CHECK-NEXT:    addi a1, a0, 265
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 3
-; CHECK-NEXT:    vmerge.vim v10, v9, 2, v0
+; CHECK-NEXT:    addi a1, a0, 265
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    lui a1, 4
 ; CHECK-NEXT:    addi a1, a1, 548
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmerge.vim v9, v9, 2, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    addi a0, a0, -1856
-; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v9, v9, 0, v0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v16, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -285,22 +287,23 @@ define <32 x i32> @v32i32_v4i32(<4 x i32>) {
 ; CHECK-NEXT:    addi a1, a1, 1161
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    lui a1, 270865
+; CHECK-NEXT:    addi a1, a1, 548
+; CHECK-NEXT:    vmv.s.x v9, a1
+; CHECK-NEXT:    lui a1, 100550
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 3
-; CHECK-NEXT:    vmerge.vim v10, v10, 2, v0
-; CHECK-NEXT:    lui a0, 270865
-; CHECK-NEXT:    addi a0, a0, 548
+; CHECK-NEXT:    addi a0, a1, 64
+; CHECK-NEXT:    vmerge.vim v18, v10, 2, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    lui a0, 100550
-; CHECK-NEXT:    addi a0, a0, 64
-; CHECK-NEXT:    vmv.s.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v10, v10, 0, v0
+; CHECK-NEXT:    vmv.s.x v16, a0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v18, v18, 0, v0
+; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmerge.vim v16, v18, 1, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vsext.vf2 v24, v10
+; CHECK-NEXT:    vsext.vf2 v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v16, v8, v24
 ; CHECK-NEXT:    vmv.v.v v8, v16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 925366e8b1d500..38026bb591f797 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -51,12 +51,10 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x
 ; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLA-NEXT:    vslideup.vi v14, v15, 1
 ; VLA-NEXT:    vslideup.vi v12, v13, 1
-; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; VLA-NEXT:    vslideup.vi v12, v14, 2
-; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLA-NEXT:    vslideup.vi v10, v11, 1
 ; VLA-NEXT:    vslideup.vi v8, v9, 1
 ; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 2
 ; VLA-NEXT:    vslideup.vi v8, v10, 2
 ; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v12, 4
@@ -65,9 +63,9 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x
 ; VLS-LABEL: concat_8xv1i32:
 ; VLS:       # %bb.0:
 ; VLS-NEXT:    vmv1r.v v17, v12
+; VLS-NEXT:    vmv1r.v v16, v8
 ; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; VLS-NEXT:    vslideup.vi v14, v15, 1
-; VLS-NEXT:    vmv1r.v v16, v8
 ; VLS-NEXT:    vslideup.vi v17, v13, 1
 ; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; VLS-NEXT:    vslideup.vi v17, v14, 2
@@ -131,12 +129,10 @@ define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 ; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; VLA-NEXT:    vslideup.vi v14, v15, 2
 ; VLA-NEXT:    vslideup.vi v12, v13, 2
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v12, v14, 4
-; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; VLA-NEXT:    vslideup.vi v10, v11, 2
 ; VLA-NEXT:    vslideup.vi v8, v9, 2
 ; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v12, v14, 4
 ; VLA-NEXT:    vslideup.vi v8, v10, 4
 ; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v12, 8
@@ -187,10 +183,10 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 ; VLA-NEXT:    vmv2r.v v20, v14
 ; VLA-NEXT:    vmv2r.v v16, v12
 ; VLA-NEXT:    vmv2r.v v12, v10
+; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; VLA-NEXT:    vslideup.vi v16, v20, 8
 ; VLA-NEXT:    vslideup.vi v8, v12, 8
-; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v16, 16
 ; VLA-NEXT:    ret
@@ -209,22 +205,20 @@ define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x
 ; VLA:       # %bb.0:
 ; VLA-NEXT:    vmv1r.v v18, v15
 ; VLA-NEXT:    vmv1r.v v20, v14
-; VLA-NEXT:    vmv1r.v v22, v13
+; VLA-NEXT:    vmv1r.v v14, v13
 ; VLA-NEXT:    vmv1r.v v16, v12
-; VLA-NEXT:    vmv1r.v v14, v11
+; VLA-NEXT:    vmv1r.v v22, v11
 ; VLA-NEXT:    vmv1r.v v12, v10
 ; VLA-NEXT:    vmv1r.v v10, v9
+; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; VLA-NEXT:    vslideup.vi v20, v18, 4
-; VLA-NEXT:    vslideup.vi v16, v22, 4
-; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; VLA-NEXT:    vslideup.vi v16, v20, 8
-; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; VLA-NEXT:    vslideup.vi v12, v14, 4
+; VLA-NEXT:    vslideup.vi v16, v14, 4
+; VLA-NEXT:    vslideup.vi v12, v22, 4
 ; VLA-NEXT:    vslideup.vi v8, v10, 4
 ; VLA-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
+; VLA-NEXT:    vslideup.vi v16, v20, 8
 ; VLA-NEXT:    vslideup.vi v8, v12, 8
-; VLA-NEXT:    li a0, 32
 ; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; VLA-NEXT:    vslideup.vi v8, v16, 16
 ; VLA-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
index a8f75f8d1c24d1..f04faf5cd2c54f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll
@@ -15,15 +15,15 @@ define void @deinterleave3_0_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 3
 ; CHECK-NEXT:    vmul.vx v9, v9, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vadd.vi v9, v9, -8
 ; CHECK-NEXT:    li a0, 56
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vadd.vi v10, v9, -8
+; CHECK-NEXT:    vrgather.vv v11, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vse8.v v10, (a1)
+; CHECK-NEXT:    vrgather.vv v11, v8, v10, v0.t
+; CHECK-NEXT:    vse8.v v11, (a1)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i8>, ptr %in, align 1
@@ -42,13 +42,13 @@ define void @deinterleave3_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 3
 ; CHECK-NEXT:    vmadd.vx v10, a0, v9
+; CHECK-NEXT:    li a0, 24
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsrl.vi v10, v8, 8
 ; CHECK-NEXT:    vsll.vi v8, v8, 8
-; CHECK-NEXT:    li a0, 24
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vor.vv v8, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
@@ -67,13 +67,13 @@ define void @deinterleave4_0_i8(ptr %in, ptr %out) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 4
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v10, v8, v9
-; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vsll.vi v9, v9, 2
@@ -100,11 +100,11 @@ define void @deinterleave4_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vmv.v.i v9, -9
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 5
+; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vmacc.vx v9, a0, v10
 ; CHECK-NEXT:    vsll.vi v10, v10, 2
 ; CHECK-NEXT:    vadd.vi v10, v10, 1
 ; CHECK-NEXT:    vrgather.vv v11, v8, v10
-; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -126,15 +126,15 @@ define void @deinterleave5_0_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 5
-; CHECK-NEXT:    vmul.vx v9, v9, a0
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vadd.vi v9, v9, -8
 ; CHECK-NEXT:    vmv.v.i v0, 12
+; CHECK-NEXT:    vmul.vx v9, v9, a0
+; CHECK-NEXT:    vadd.vi v10, v9, -8
+; CHECK-NEXT:    vrgather.vv v11, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vse8.v v10, (a1)
+; CHECK-NEXT:    vrgather.vv v11, v8, v10, v0.t
+; CHECK-NEXT:    vse8.v v11, (a1)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i8>, ptr %in, align 1
@@ -153,8 +153,8 @@ define void @deinterleave5_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 5
 ; CHECK-NEXT:    vmadd.vx v10, a0, v9
-; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.i v0, 4
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -176,9 +176,9 @@ define void @deinterleave6_0_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 6
+; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vmul.vx v9, v9, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -202,8 +202,8 @@ define void @deinterleave6_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 6
 ; CHECK-NEXT:    vmadd.vx v10, a0, v9
-; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv.v.i v0, 4
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -225,9 +225,9 @@ define void @deinterleave7_0_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    li a0, 7
+; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vmul.vx v9, v9, a0
 ; CHECK-NEXT:    vrgather.vv v10, v8, v9
-; CHECK-NEXT:    vmv.v.i v0, 4
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
@@ -250,8 +250,8 @@ define void @deinterleave7_8_i8(ptr %in, ptr %out) {
 ; CHECK-NEXT:    vmv.v.i v9, -6
 ; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    li a0, 6
-; CHECK-NEXT:    vmadd.vx v10, a0, v9
 ; CHECK-NEXT:    vmv.v.i v0, 6
+; CHECK-NEXT:    vmadd.vx v10, a0, v9
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v9, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
index f5c45ba9ea5817..407535831aedad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-reverse.ll
@@ -93,18 +93,19 @@ define <32 x i1> @reverse_v32i1(<32 x i1> %a) {
 ; NO-ZVBB-LABEL: reverse_v32i1:
 ; NO-ZVBB:       # %bb.0:
 ; NO-ZVBB-NEXT:    li a0, 32
-; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; NO-ZVBB-NEXT:    vmv.v.i v8, 0
-; NO-ZVBB-NEXT:    vmerge.vim v8, v8, 1, v0
 ; NO-ZVBB-NEXT:    csrr a1, vlenb
+; NO-ZVBB-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; NO-ZVBB-NEXT:    vid.v v8
+; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; NO-ZVBB-NEXT:    vmv.v.i v10, 0
 ; NO-ZVBB-NEXT:    addi a2, a1, -1
+; NO-ZVBB-NEXT:    slli a1, a1, 1
+; NO-ZVBB-NEXT:    vmerge.vim v10, v10, 1, v0
 ; NO-ZVBB-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; NO-ZVBB-NEXT:    vid.v v10
-; NO-ZVBB-NEXT:    vrsub.vx v10, v10, a2
+; NO-ZVBB-NEXT:    vrsub.vx v8, v8, a2
 ; NO-ZVBB-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; NO-ZVBB-NEXT:    vrgatherei16.vv v13, v8, v10
-; NO-ZVBB-NEXT:    vrgatherei16.vv v12, v9, v10
-; NO-ZVBB-NEXT:    slli a1, a1, 1
+; NO-ZVBB-NEXT:    vrgatherei16.vv v13, v10, v8
+; NO-ZVBB-NEXT:    vrgatherei16.vv v12, v11, v8
 ; NO-ZVBB-NEXT:    addi a1, a1, -32
 ; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; NO-ZVBB-NEXT:    vslidedown.vx v8, v12, a1
@@ -124,23 +125,24 @@ define <64 x i1> @reverse_v64i1(<64 x i1> %a) {
 ; NO-ZVBB-LABEL: reverse_v64i1:
 ; NO-ZVBB:       # %bb.0:
 ; NO-ZVBB-NEXT:    li a0, 64
+; NO-ZVBB-NEXT:    csrr a1, vlenb
+; NO-ZVBB-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; NO-ZVBB-NEXT:    vid.v v12
 ; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; NO-ZVBB-NEXT:    vmv.v.i v8, 0
-; NO-ZVBB-NEXT:    vmerge.vim v12, v8, 1, v0
-; NO-ZVBB-NEXT:    csrr a1, vlenb
 ; NO-ZVBB-NEXT:    addi a2, a1, -1
+; NO-ZVBB-NEXT:    slli a1, a1, 2
+; NO-ZVBB-NEXT:    vmerge.vim v8, v8, 1, v0
 ; NO-ZVBB-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; NO-ZVBB-NEXT:    vid.v v8
-; NO-ZVBB-NEXT:    vrsub.vx v16, v8, a2
+; NO-ZVBB-NEXT:    vrsub.vx v12, v12, a2
 ; NO-ZVBB-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; NO-ZVBB-NEXT:    vrgatherei16.vv v11, v12, v16
-; NO-ZVBB-NEXT:    vrgatherei16.vv v10, v13, v16
-; NO-ZVBB-NEXT:    vrgatherei16.vv v9, v14, v16
-; NO-ZVBB-NEXT:    vrgatherei16.vv v8, v15, v16
-; NO-ZVBB-NEXT:    slli a1, a1, 2
+; NO-ZVBB-NEXT:    vrgatherei16.vv v19, v8, v12
+; NO-ZVBB-NEXT:    vrgatherei16.vv v18, v9, v12
+; NO-ZVBB-NEXT:    vrgatherei16.vv v17, v10, v12
+; NO-ZVBB-NEXT:    vrgatherei16.vv v16, v11, v12
 ; NO-ZVBB-NEXT:    addi a1, a1, -64
 ; NO-ZVBB-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
-; NO-ZVBB-NEXT:    vslidedown.vx v8, v8, a1
+; NO-ZVBB-NEXT:    vslidedown.vx v8, v16, a1
 ; NO-ZVBB-NEXT:    vmsne.vi v0, v8, 0
 ; NO-ZVBB-NEXT:    ret
 ;
@@ -157,13 +159,15 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) {
 ; CHECK-LABEL: reverse_v128i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v16, 0
 ; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    vrsub.vx v24, v8, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v15, v16, v24
@@ -174,7 +178,6 @@ define <128 x i1> @reverse_v128i1(<128 x i1> %a) {
 ; CHECK-NEXT:    vrgatherei16.vv v10, v21, v24
 ; CHECK-NEXT:    vrgatherei16.vv v9, v22, v24
 ; CHECK-NEXT:    vrgatherei16.vv v8, v23, v24
-; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    addi a1, a1, -128
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a1
@@ -253,15 +256,15 @@ define <32 x i8> @reverse_v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: reverse_v32i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    addi a1, a0, -1
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
+; CHECK-NEXT:    addi a0, a0, -32
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v13, v8, v10
 ; CHECK-NEXT:    vrgatherei16.vv v12, v9, v10
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    addi a0, a0, -32
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -274,20 +277,20 @@ define <64 x i8> @reverse_v64i8(<64 x i8> %a) {
 ; CHECK-LABEL: reverse_v64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v15, v8, v16
-; CHECK-NEXT:    vrgatherei16.vv v14, v9, v16
-; CHECK-NEXT:    vrgatherei16.vv v13, v10, v16
-; CHECK-NEXT:    vrgatherei16.vv v12, v11, v16
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
 ; CHECK-NEXT:    addi a0, a0, -64
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v19, v8, v12
+; CHECK-NEXT:    vrgatherei16.vv v18, v9, v12
+; CHECK-NEXT:    vrgatherei16.vv v17, v10, v12
+; CHECK-NEXT:    vrgatherei16.vv v16, v11, v12
 ; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <64 x i8> %a, <64 x i8> poison, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <64 x i8> %res
@@ -349,10 +352,10 @@ define <16 x i16> @reverse_v16i16(<16 x i16> %a) {
 ; CHECK-LABEL: reverse_v16i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
@@ -368,20 +371,20 @@ define <32 x i16> @reverse_v32i16(<32 x i16> %a) {
 ; CHECK-LABEL: reverse_v32i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 1
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v16
-; CHECK-NEXT:    vrgather.vv v14, v9, v16
-; CHECK-NEXT:    vrgather.vv v13, v10, v16
-; CHECK-NEXT:    vrgather.vv v12, v11, v16
+; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    vrgather.vv v19, v8, v12
+; CHECK-NEXT:    vrgather.vv v18, v9, v12
+; CHECK-NEXT:    vrgather.vv v17, v10, v12
+; CHECK-NEXT:    vrgather.vv v16, v11, v12
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <32 x i16> %a, <32 x i16> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x i16> %res
@@ -430,14 +433,14 @@ define <8 x i32> @reverse_v8i32(<8 x i32> %a) {
 ; CHECK-LABEL: reverse_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
-; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -450,10 +453,10 @@ define <16 x i32> @reverse_v16i32(<16 x i32> %a) {
 ; CHECK-LABEL: reverse_v16i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    vrsub.vx v16, v12, a1
 ; CHECK-NEXT:    vrgather.vv v15, v8, v16
 ; CHECK-NEXT:    vrgather.vv v14, v9, v16
@@ -491,14 +494,14 @@ define <4 x i64> @reverse_v4i64(<4 x i64> %a) {
 ; CHECK-LABEL: reverse_v4i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
-; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -4
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -511,19 +514,19 @@ define <8 x i64> @reverse_v8i64(<8 x i64> %a) {
 ; CHECK-LABEL: reverse_v8i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v16
-; CHECK-NEXT:    vrgather.vv v14, v9, v16
-; CHECK-NEXT:    vrgather.vv v13, v10, v16
-; CHECK-NEXT:    vrgather.vv v12, v11, v16
+; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    vrgather.vv v19, v8, v12
+; CHECK-NEXT:    vrgather.vv v18, v9, v12
+; CHECK-NEXT:    vrgather.vv v17, v10, v12
+; CHECK-NEXT:    vrgather.vv v16, v11, v12
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i64> %res
@@ -586,10 +589,10 @@ define <16 x half> @reverse_v16f16(<16 x half> %a) {
 ; CHECK-LABEL: reverse_v16f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
@@ -605,20 +608,20 @@ define <32 x half> @reverse_v32f16(<32 x half> %a) {
 ; CHECK-LABEL: reverse_v32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 1
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v16
-; CHECK-NEXT:    vrgather.vv v14, v9, v16
-; CHECK-NEXT:    vrgather.vv v13, v10, v16
-; CHECK-NEXT:    vrgather.vv v12, v11, v16
+; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    addi a0, a0, -32
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    vrgather.vv v19, v8, v12
+; CHECK-NEXT:    vrgather.vv v18, v9, v12
+; CHECK-NEXT:    vrgather.vv v17, v10, v12
+; CHECK-NEXT:    vrgather.vv v16, v11, v12
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <32 x half> %a, <32 x half> poison, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x half> %res
@@ -667,14 +670,14 @@ define <8 x float> @reverse_v8f32(<8 x float> %a) {
 ; CHECK-LABEL: reverse_v8f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
-; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -687,10 +690,10 @@ define <16 x float> @reverse_v16f32(<16 x float> %a) {
 ; CHECK-LABEL: reverse_v16f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    vrsub.vx v16, v12, a1
 ; CHECK-NEXT:    vrgather.vv v15, v8, v16
 ; CHECK-NEXT:    vrgather.vv v14, v9, v16
@@ -728,14 +731,14 @@ define <4 x double> @reverse_v4f64(<4 x double> %a) {
 ; CHECK-LABEL: reverse_v4f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v10
 ; CHECK-NEXT:    vrgather.vv v12, v9, v10
-; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -4
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
@@ -748,19 +751,19 @@ define <8 x double> @reverse_v8f64(<8 x double> %a) {
 ; CHECK-LABEL: reverse_v8f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v16, v12, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v16
-; CHECK-NEXT:    vrgather.vv v14, v9, v16
-; CHECK-NEXT:    vrgather.vv v13, v10, v16
-; CHECK-NEXT:    vrgather.vv v12, v11, v16
+; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    vrgather.vv v19, v8, v12
+; CHECK-NEXT:    vrgather.vv v18, v9, v12
+; CHECK-NEXT:    vrgather.vv v17, v10, v12
+; CHECK-NEXT:    vrgather.vv v16, v11, v12
 ; CHECK-NEXT:    addi a0, a0, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x double> %a, <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x double> %res
@@ -946,9 +949,9 @@ define <16 x i8> @reverse_v16i8_2(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vid.v v11
+; CHECK-NEXT:    li a0, 255
 ; CHECK-NEXT:    vrsub.vi v12, v11, 15
 ; CHECK-NEXT:    vrgather.vv v10, v8, v12
-; CHECK-NEXT:    li a0, 255
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
@@ -965,25 +968,24 @@ define <32 x i8> @reverse_v32i8_2(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vrsub.vx v12, v12, a1
+; CHECK-NEXT:    lui a1, 16
+; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v9, v8, v12
-; CHECK-NEXT:    vrgatherei16.vv v8, v11, v12
+; CHECK-NEXT:    vrgatherei16.vv v15, v8, v12
+; CHECK-NEXT:    vrgatherei16.vv v14, v9, v12
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
+; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    addi a0, a0, -32
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vi v12, v12, 15
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, mu
+; CHECK-NEXT:    vrsub.vi v12, v8, 15
+; CHECK-NEXT:    vslidedown.vx v8, v14, a0
 ; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i8> %a, <16 x i8> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1035,21 +1037,21 @@ define <16 x i16> @reverse_v16i16_2(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v8, v9
-; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrgather.vv v12, v11, v9
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    li a1, 255
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrsub.vi v14, v8, 7
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vi v12, v12, 7
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v10, v14, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i16> %a, <8 x i16> %b,  <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x i16> %res
@@ -1058,30 +1060,30 @@ define <16 x i16> @reverse_v16i16_2(<8 x i16> %a, <8 x i16> %b) {
 define <32 x i16> @reverse_v32i16_2(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: reverse_v32i16_2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    lui a1, 16
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v12
-; CHECK-NEXT:    vrgather.vv v18, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv2r.v v12, v10
-; CHECK-NEXT:    vmv.v.v v17, v16
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vx v10, v10, a1
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    vrgather.vv v19, v8, v10
+; CHECK-NEXT:    vrgather.vv v18, v9, v10
+; CHECK-NEXT:    vrgather.vv v16, v11, v10
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
+; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    addi a0, a0, -32
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vrsub.vi v20, v8, 15
+; CHECK-NEXT:    vmv1r.v v17, v16
 ; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vrsub.vi v16, v16, 15
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
-; CHECK-NEXT:    vrgather.vv v8, v12, v16, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v12, v20, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i16> %a, <16 x i16> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x i16> %res
@@ -1116,23 +1118,22 @@ define <8 x i32> @reverse_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v8, v9
+; CHECK-NEXT:    vrgather.vv v12, v11, v9
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vrsub.vi v14, v8, 3
 ; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrsub.vi v12, v11, 3
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vrgatherei16.vv v8, v10, v14, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x i32> %res
@@ -1141,26 +1142,25 @@ define <8 x i32> @reverse_v8i32_2(<4 x i32> %a, <4 x i32> %b) {
 define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: reverse_v16i32_2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v12
-; CHECK-NEXT:    vrgather.vv v18, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv2r.v v12, v10
-; CHECK-NEXT:    vmv.v.v v17, v16
-; CHECK-NEXT:    addi a0, a0, -16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vx v14, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v14
+; CHECK-NEXT:    vrgather.vv v10, v9, v14
+; CHECK-NEXT:    vrgather.vv v8, v9, v14
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v14
+; CHECK-NEXT:    li a1, 255
+; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    vrsub.vi v16, v14, 7
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv1r.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i32> %a, <8 x i32> %b,  <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1170,33 +1170,32 @@ define <16 x i32> @reverse_v16i32_2(<8 x i32> %a, <8 x i32> %b) {
 define <32 x i32> @reverse_v32i32_2(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: reverse_v32i32_2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv4r.v v16, v12
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v12
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vrsub.vx v17, v16, a1
-; CHECK-NEXT:    vrgather.vv v23, v8, v17
-; CHECK-NEXT:    vrgather.vv v22, v9, v17
-; CHECK-NEXT:    vrgather.vv v21, v10, v17
-; CHECK-NEXT:    vrgather.vv v20, v11, v17
-; CHECK-NEXT:    vrgather.vv v16, v8, v17
-; CHECK-NEXT:    vmv.v.v v17, v16
-; CHECK-NEXT:    vmv4r.v v24, v12
-; CHECK-NEXT:    vmv2r.v v18, v16
+; CHECK-NEXT:    vrsub.vx v20, v12, a1
+; CHECK-NEXT:    vrgather.vv v15, v8, v20
+; CHECK-NEXT:    vrgather.vv v14, v9, v20
+; CHECK-NEXT:    vrgather.vv v13, v10, v20
+; CHECK-NEXT:    vrgather.vv v12, v11, v20
+; CHECK-NEXT:    lui a1, 16
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    vrgather.vv v8, v9, v20
+; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT:    vid.v v20
 ; CHECK-NEXT:    addi a0, a0, -32
-; CHECK-NEXT:    li a1, 32
-; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vrsub.vi v16, v16, 15
-; CHECK-NEXT:    lui a0, 16
-; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vrsub.vi v24, v20, 15
+; CHECK-NEXT:    vmv2r.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgatherei16.vv v8, v24, v16, v0.t
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x i32> %a, <16 x i32> %b,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x i32> %res
@@ -1220,26 +1219,27 @@ define <4 x i64> @reverse_v4i64_2(<2 x i64> %a, < 2 x i64> %b) {
 define <8 x i64> @reverse_v8i64_2(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK-LABEL: reverse_v8i64_2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v12
-; CHECK-NEXT:    vrgather.vv v18, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv2r.v v12, v10
-; CHECK-NEXT:    vmv.v.v v17, v16
+; CHECK-NEXT:    vrsub.vx v14, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v14
+; CHECK-NEXT:    vrgather.vv v10, v9, v14
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v15
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgather.vv v8, v9, v14
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v15, 3
 ; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v14
-; CHECK-NEXT:    vrsub.vi v16, v14, 3
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vmv1r.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1291,21 +1291,21 @@ define <16 x half> @reverse_v16f16_2(<8 x half> %a, <8 x half> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v8, v9
-; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrgather.vv v12, v11, v9
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    li a1, 255
+; CHECK-NEXT:    addi a0, a0, -16
+; CHECK-NEXT:    vrsub.vi v14, v8, 7
+; CHECK-NEXT:    vmv.s.x v0, a1
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vi v12, v12, 7
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
-; CHECK-NEXT:    vrgather.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vrgather.vv v8, v10, v14, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x half> %a, <8 x half> %b,  <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <16 x half> %res
@@ -1315,20 +1315,20 @@ define <32 x half> @reverse_v32f16_2(<16 x half> %a) {
 ; CHECK-LABEL: reverse_v32f16_2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vx v10, v10, a1
-; CHECK-NEXT:    vrgather.vv v15, v8, v10
-; CHECK-NEXT:    vrgather.vv v14, v9, v10
-; CHECK-NEXT:    vrgather.vv v12, v8, v10
-; CHECK-NEXT:    vmv.v.v v13, v12
+; CHECK-NEXT:    vrsub.vx v12, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v12
+; CHECK-NEXT:    vrgather.vv v10, v9, v12
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    vrgather.vv v8, v9, v12
 ; CHECK-NEXT:    addi a0, a0, -32
-; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vmv.v.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    ret
   %res = shufflevector <16 x half> %a, <16 x half> poison,  <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <32 x half> %res
@@ -1363,23 +1363,22 @@ define <8 x float> @reverse_v8f32_2(<4 x float> %a, <4 x float> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    vrsub.vx v9, v9, a1
 ; CHECK-NEXT:    vrgather.vv v13, v8, v9
-; CHECK-NEXT:    vrgather.vv v12, v8, v9
+; CHECK-NEXT:    vrgather.vv v12, v11, v9
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vrsub.vi v14, v8, 3
 ; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v12, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v11
-; CHECK-NEXT:    vrsub.vi v12, v11, 3
-; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v10, v12, v0.t
+; CHECK-NEXT:    vslidedown.vx v8, v12, a0
+; CHECK-NEXT:    vrgatherei16.vv v8, v10, v14, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret <8 x float> %res
@@ -1388,26 +1387,25 @@ define <8 x float> @reverse_v8f32_2(<4 x float> %a, <4 x float> %b) {
 define <16 x float> @reverse_v16f32_2(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: reverse_v16f32_2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v12
-; CHECK-NEXT:    vrgather.vv v18, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv2r.v v12, v10
-; CHECK-NEXT:    vmv.v.v v17, v16
-; CHECK-NEXT:    addi a0, a0, -16
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vx v14, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v14
+; CHECK-NEXT:    vrgather.vv v10, v9, v14
+; CHECK-NEXT:    vrgather.vv v8, v9, v14
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v14
+; CHECK-NEXT:    li a1, 255
+; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    vrsub.vi v16, v14, 7
-; CHECK-NEXT:    li a0, 255
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.s.x v0, a1
+; CHECK-NEXT:    vmv1r.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1432,26 +1430,27 @@ define <4 x double> @reverse_v4f64_2(<2 x double> %a, < 2 x double> %b) {
 define <8 x double> @reverse_v8f64_2(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: reverse_v8f64_2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv2r.v v12, v10
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vid.v v12
-; CHECK-NEXT:    vrsub.vx v12, v12, a1
-; CHECK-NEXT:    vrgather.vv v19, v8, v12
-; CHECK-NEXT:    vrgather.vv v18, v9, v12
-; CHECK-NEXT:    vrgather.vv v16, v8, v12
-; CHECK-NEXT:    vmv2r.v v12, v10
-; CHECK-NEXT:    vmv.v.v v17, v16
+; CHECK-NEXT:    vrsub.vx v14, v10, a1
+; CHECK-NEXT:    vrgather.vv v11, v8, v14
+; CHECK-NEXT:    vrgather.vv v10, v9, v14
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v15
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgather.vv v8, v9, v14
+; CHECK-NEXT:    vmv.v.i v0, 15
 ; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vi v16, v15, 3
 ; CHECK-NEXT:    addi a0, a0, -8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v16, a0
-; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v14
-; CHECK-NEXT:    vrsub.vi v16, v14, 3
-; CHECK-NEXT:    vmv.v.i v0, 15
+; CHECK-NEXT:    vmv1r.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    vrgatherei16.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -1464,18 +1463,19 @@ define <256 x i1> @reverse_v256i1(<256 x i1> %a) vscale_range(16, 1024) {
 ; CHECK-LABEL: reverse_v256i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 256
-; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vrsub.vx v10, v10, a2
+; CHECK-NEXT:    vrsub.vx v8, v8, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v13, v8, v10
-; CHECK-NEXT:    vrgatherei16.vv v12, v9, v10
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vrgatherei16.vv v13, v10, v8
+; CHECK-NEXT:    vrgatherei16.vv v12, v11, v8
 ; CHECK-NEXT:    addi a1, a1, -256
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v12, a1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
index 3cfcb4398a1f00..02355d331e13f6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
@@ -482,15 +482,15 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 48
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8i16_as_i64_16:
@@ -528,15 +528,15 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8i16_as_i64_32:
@@ -574,15 +574,15 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 16
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8i16_as_i64_48:
@@ -620,15 +620,15 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v10, v12, a1
-; RV32-NEXT:    vsrl.vv v10, v8, v10
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vand.vx v12, v12, a1
-; RV32-NEXT:    vsll.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vsrl.vv v12, v8, v12
+; RV32-NEXT:    vsll.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8i32_as_i64:
@@ -691,15 +691,15 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 48
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f16_as_i64_16:
@@ -737,15 +737,15 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f16_as_i64_32:
@@ -783,15 +783,15 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) {
 ; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    li a0, 16
-; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV32-NEXT:    vand.vx v9, v10, a1
-; RV32-NEXT:    vsrl.vv v9, v8, v9
-; RV32-NEXT:    vmv.v.x v10, a0
+; RV32-NEXT:    vmv.v.x v9, a0
 ; RV32-NEXT:    vand.vx v10, v10, a1
-; RV32-NEXT:    vsll.vv v8, v8, v10
-; RV32-NEXT:    vor.vv v8, v8, v9
+; RV32-NEXT:    vand.vx v9, v9, a1
+; RV32-NEXT:    vsrl.vv v10, v8, v10
+; RV32-NEXT:    vsll.vv v8, v8, v9
+; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f16_as_i64_48:
@@ -829,15 +829,15 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) {
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v10, v12, a1
-; RV32-NEXT:    vsrl.vv v10, v8, v10
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vand.vx v12, v12, a1
-; RV32-NEXT:    vsll.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vsrl.vv v12, v8, v12
+; RV32-NEXT:    vsll.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f32_as_i64:
@@ -876,15 +876,15 @@ define <8 x float> @shuffle_v8f32_as_i64_exact(<8 x float> %v) vscale_range(2,2)
 ; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v10, 0
 ; RV32-NEXT:    li a0, 32
-; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    li a1, 63
+; RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT:    vand.vx v10, v12, a1
-; RV32-NEXT:    vsrl.vv v10, v8, v10
-; RV32-NEXT:    vmv.v.x v12, a0
+; RV32-NEXT:    vmv.v.x v10, a0
 ; RV32-NEXT:    vand.vx v12, v12, a1
-; RV32-NEXT:    vsll.vv v8, v8, v12
-; RV32-NEXT:    vor.vv v8, v8, v10
+; RV32-NEXT:    vand.vx v10, v10, a1
+; RV32-NEXT:    vsrl.vv v12, v8, v12
+; RV32-NEXT:    vsll.vv v8, v8, v10
+; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: shuffle_v8f32_as_i64_exact:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
index 15c2c2298c0dd6..8f6240e112cdd7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll
@@ -386,22 +386,23 @@ define void @vnsrl_0_i8_undef3(ptr %in, ptr %out) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetivli zero, 16, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    li a0, -32
+; CHECK-NEXT:    vmv.s.x v0, a0
 ; CHECK-NEXT:    lui a0, 24640
-; CHECK-NEXT:    addi a0, a0, 6
-; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
 ; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    addi a0, a0, 6
 ; CHECK-NEXT:    vadd.vv v9, v9, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vadd.vi v9, v9, -8
-; CHECK-NEXT:    li a0, -32
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vrgather.vv v11, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vse8.v v10, (a1)
+; CHECK-NEXT:    vrgather.vv v11, v8, v9, v0.t
+; CHECK-NEXT:    vse8.v v11, (a1)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i8>, ptr %in, align 1
@@ -419,18 +420,18 @@ define void @vnsrl_0_i8_undef_negative(ptr %in, ptr %out) {
 ; CHECK-NEXT:    lui a0, %hi(.LCPI17_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI17_0)
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    vrgather.vv v10, v8, v9
 ; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vadd.vi v9, v9, -8
+; CHECK-NEXT:    vle8.v v10, (a0)
 ; CHECK-NEXT:    li a0, 48
+; CHECK-NEXT:    vadd.vv v9, v9, v9
 ; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vadd.vi v9, v9, -8
+; CHECK-NEXT:    vrgather.vv v11, v8, v10
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 8
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf4, ta, mu
-; CHECK-NEXT:    vrgather.vv v10, v8, v9, v0.t
-; CHECK-NEXT:    vse8.v v10, (a1)
+; CHECK-NEXT:    vrgather.vv v11, v8, v9, v0.t
+; CHECK-NEXT:    vse8.v v11, (a1)
 ; CHECK-NEXT:    ret
 entry:
   %0 = load <16 x i8>, ptr %in, align 1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
index 5232d0d69fad08..74f2cec04f0de9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll
@@ -88,17 +88,17 @@ define void @store_v6i1(ptr %p, <6 x i1> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vfirst.m a1, v0
-; CHECK-NEXT:    seqz a1, a1
 ; CHECK-NEXT:    vmv.x.s a2, v0
+; CHECK-NEXT:    seqz a1, a1
 ; CHECK-NEXT:    andi a3, a2, 2
+; CHECK-NEXT:    andi a4, a2, 4
 ; CHECK-NEXT:    or a1, a1, a3
-; CHECK-NEXT:    andi a3, a2, 4
-; CHECK-NEXT:    andi a4, a2, 8
-; CHECK-NEXT:    or a3, a3, a4
-; CHECK-NEXT:    or a1, a1, a3
-; CHECK-NEXT:    andi a3, a2, 16
+; CHECK-NEXT:    andi a3, a2, 8
+; CHECK-NEXT:    or a3, a4, a3
+; CHECK-NEXT:    andi a4, a2, 16
 ; CHECK-NEXT:    andi a2, a2, -32
-; CHECK-NEXT:    or a2, a3, a2
+; CHECK-NEXT:    or a1, a1, a3
+; CHECK-NEXT:    or a2, a4, a2
 ; CHECK-NEXT:    or a1, a1, a2
 ; CHECK-NEXT:    andi a1, a1, 63
 ; CHECK-NEXT:    sb a1, 0(a0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
index ed72883e9d0523..ddde1e94abbde9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -74,9 +74,9 @@ define void @widen_4xv4i16_unaligned(ptr %x, ptr %z) {
 ; CHECK-NO-MISALIGN:       # %bb.0:
 ; CHECK-NO-MISALIGN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NO-MISALIGN-NEXT:    vle8.v v8, (a0)
+; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 16
+; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a2)
 ; CHECK-NO-MISALIGN-NEXT:    addi a2, a0, 8
-; CHECK-NO-MISALIGN-NEXT:    addi a3, a0, 16
-; CHECK-NO-MISALIGN-NEXT:    vle8.v v10, (a3)
 ; CHECK-NO-MISALIGN-NEXT:    addi a0, a0, 24
 ; CHECK-NO-MISALIGN-NEXT:    vle8.v v9, (a0)
 ; CHECK-NO-MISALIGN-NEXT:    vle8.v v11, (a2)
@@ -185,9 +185,9 @@ define void @strided_constant_mismatch_4xv4i16(ptr %x, ptr %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a2, a0, 6
+; CHECK-NEXT:    vle16.v v10, (a2)
 ; CHECK-NEXT:    addi a2, a0, 2
-; CHECK-NEXT:    addi a3, a0, 6
-; CHECK-NEXT:    vle16.v v10, (a3)
 ; CHECK-NEXT:    addi a0, a0, 8
 ; CHECK-NEXT:    vle16.v v9, (a0)
 ; CHECK-NEXT:    vle16.v v11, (a2)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
index 28202dc07f9564..d506842b5eff67 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll
@@ -54,11 +54,11 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-LABEL: gather_masked:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addi a2, a0, 1024
-; CHECK-NEXT:    lui a3, 983765
-; CHECK-NEXT:    addi a3, a3, 873
-; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a3
+; CHECK-NEXT:    lui a4, 983765
 ; CHECK-NEXT:    li a3, 32
+; CHECK-NEXT:    addi a4, a4, 873
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v0, a4
 ; CHECK-NEXT:    li a4, 5
 ; CHECK-NEXT:  .LBB1_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -303,9 +303,9 @@ define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vlse8.v v9, (a0), a3
+; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsse8.v v8, (a0), a3
-; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    addi a0, a0, 160
 ; CHECK-NEXT:    bne a1, a2, .LBB6_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -348,9 +348,9 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read
 ; CHECK-NEXT:    vle8.v v9, (a1)
 ; CHECK-NEXT:    vmv1r.v v10, v8
 ; CHECK-NEXT:    vlse8.v v10, (a0), a4, v0.t
+; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    vadd.vv v9, v10, v9
 ; CHECK-NEXT:    vsse8.v v9, (a0), a4, v0.t
-; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    addi a0, a0, 160
 ; CHECK-NEXT:    bne a1, a2, .LBB7_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -384,14 +384,14 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
 ; CHECK-LABEL: gather_pow2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a2, 1
-; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    li a3, 16
+; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:  .LBB8_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
-; CHECK-NEXT:    vlse32.v v8, (a1), a3
+; CHECK-NEXT:    vlse32.v v8, (a1), a2
 ; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
@@ -400,7 +400,7 @@ define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonl
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    addi a0, a0, 32
 ; CHECK-NEXT:    addi a1, a1, 128
-; CHECK-NEXT:    bne a0, a2, .LBB8_1
+; CHECK-NEXT:    bne a0, a3, .LBB8_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
@@ -432,21 +432,21 @@ for.cond.cleanup:                                 ; preds = %vector.body
 define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
 ; CHECK-LABEL: scatter_pow2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lui a2, 1
-; CHECK-NEXT:    add a2, a1, a2
-; CHECK-NEXT:    li a3, 32
+; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    li a2, 32
+; CHECK-NEXT:    add a3, a1, a3
 ; CHECK-NEXT:    li a4, 16
 ; CHECK-NEXT:  .LBB9_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
 ; CHECK-NEXT:    vlse32.v v9, (a0), a4
+; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsse32.v v8, (a0), a4
-; CHECK-NEXT:    addi a1, a1, 32
 ; CHECK-NEXT:    addi a0, a0, 128
-; CHECK-NEXT:    bne a1, a2, .LBB9_1
+; CHECK-NEXT:    bne a1, a3, .LBB9_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
 ; CHECK-NEXT:    ret
 entry:
@@ -491,14 +491,14 @@ define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    addi a4, a0, 32
 ; CHECK-NEXT:    addi a5, a1, -128
-; CHECK-NEXT:    vlse32.v v8, (a5), a3
-; CHECK-NEXT:    vlse32.v v9, (a1), a3
-; CHECK-NEXT:    vle32.v v10, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a1), a3
+; CHECK-NEXT:    vle32.v v9, (a0)
+; CHECK-NEXT:    vlse32.v v10, (a5), a3
 ; CHECK-NEXT:    vle32.v v11, (a4)
-; CHECK-NEXT:    vadd.vv v8, v10, v8
-; CHECK-NEXT:    vadd.vv v9, v11, v9
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    vse32.v v9, (a4)
+; CHECK-NEXT:    vadd.vv v9, v9, v10
+; CHECK-NEXT:    vadd.vv v8, v11, v8
+; CHECK-NEXT:    vse32.v v9, (a0)
+; CHECK-NEXT:    vse32.v v8, (a4)
 ; CHECK-NEXT:    addi a0, a0, 64
 ; CHECK-NEXT:    addi a1, a1, 256
 ; CHECK-NEXT:    bne a0, a2, .LBB10_1
@@ -551,9 +551,9 @@ define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vlse32.v v8, (a1), a3
 ; CHECK-NEXT:    vlse32.v v9, (a0), a4
+; CHECK-NEXT:    addi a5, a1, 16
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
 ; CHECK-NEXT:    vsse32.v v8, (a0), a4
-; CHECK-NEXT:    addi a5, a1, 16
 ; CHECK-NEXT:    vlse32.v v8, (a5), a3
 ; CHECK-NEXT:    addi a5, a0, 4
 ; CHECK-NEXT:    vlse32.v v9, (a5), a4
@@ -569,10 +569,10 @@ define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture reado
 ; CHECK-NEXT:    vlse32.v v8, (a5), a3
 ; CHECK-NEXT:    addi a5, a0, 12
 ; CHECK-NEXT:    vlse32.v v9, (a5), a4
-; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vsse32.v v8, (a5), a4
 ; CHECK-NEXT:    addi a2, a2, -8
 ; CHECK-NEXT:    addi a1, a1, 512
+; CHECK-NEXT:    vadd.vv v8, v9, v8
+; CHECK-NEXT:    vsse32.v v8, (a5), a4
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    bnez a2, .LBB11_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -638,13 +638,13 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; V-NEXT:  .LBB12_1: # %bb2
 ; V-NEXT:    # =>This Inner Loop Header: Depth=1
-; V-NEXT:    addi a4, a1, 80
 ; V-NEXT:    vlse64.v v8, (a1), a3
+; V-NEXT:    addi a4, a1, 80
 ; V-NEXT:    vlse64.v v9, (a4), a3
 ; V-NEXT:    addi a4, a0, 16
 ; V-NEXT:    vse64.v v8, (a0)
-; V-NEXT:    vse64.v v9, (a4)
 ; V-NEXT:    addi a0, a0, 32
+; V-NEXT:    vse64.v v9, (a4)
 ; V-NEXT:    addi a1, a1, 160
 ; V-NEXT:    bne a0, a2, .LBB12_1
 ; V-NEXT:  # %bb.2: # %bb18
@@ -653,15 +653,16 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; ZVE32F-LABEL: gather_of_pointers:
 ; ZVE32F:       # %bb.0: # %bb
 ; ZVE32F-NEXT:    li a2, 0
-; ZVE32F-NEXT:    lui a3, 2
-; ZVE32F-NEXT:    add a3, a0, a3
-; ZVE32F-NEXT:    li a4, 1
+; ZVE32F-NEXT:    lui a4, 2
+; ZVE32F-NEXT:    li a3, 1
+; ZVE32F-NEXT:    add a4, a0, a4
 ; ZVE32F-NEXT:    li a5, 40
 ; ZVE32F-NEXT:  .LBB12_1: # %bb2
 ; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
-; ZVE32F-NEXT:    mul a6, a4, a5
-; ZVE32F-NEXT:    add a6, a1, a6
+; ZVE32F-NEXT:    mul a6, a3, a5
 ; ZVE32F-NEXT:    mul a7, a2, a5
+; ZVE32F-NEXT:    addi a2, a2, 4
+; ZVE32F-NEXT:    add a6, a1, a6
 ; ZVE32F-NEXT:    add a7, a1, a7
 ; ZVE32F-NEXT:    ld t0, 0(a7)
 ; ZVE32F-NEXT:    ld t1, 0(a6)
@@ -671,10 +672,9 @@ define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptur
 ; ZVE32F-NEXT:    sd t1, 8(a0)
 ; ZVE32F-NEXT:    sd a7, 16(a0)
 ; ZVE32F-NEXT:    sd a6, 24(a0)
-; ZVE32F-NEXT:    addi a2, a2, 4
 ; ZVE32F-NEXT:    addi a0, a0, 32
-; ZVE32F-NEXT:    addi a4, a4, 4
-; ZVE32F-NEXT:    bne a0, a3, .LBB12_1
+; ZVE32F-NEXT:    addi a3, a3, 4
+; ZVE32F-NEXT:    bne a0, a4, .LBB12_1
 ; ZVE32F-NEXT:  # %bb.2: # %bb18
 ; ZVE32F-NEXT:    ret
 bb:
@@ -719,9 +719,9 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; V-NEXT:    vle64.v v8, (a1)
 ; V-NEXT:    vle64.v v9, (a4)
 ; V-NEXT:    addi a4, a0, 80
+; V-NEXT:    addi a1, a1, 32
 ; V-NEXT:    vsse64.v v8, (a0), a3
 ; V-NEXT:    vsse64.v v9, (a4), a3
-; V-NEXT:    addi a1, a1, 32
 ; V-NEXT:    addi a0, a0, 160
 ; V-NEXT:    bne a1, a2, .LBB13_1
 ; V-NEXT:  # %bb.2: # %bb18
@@ -730,9 +730,9 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-LABEL: scatter_of_pointers:
 ; ZVE32F:       # %bb.0: # %bb
 ; ZVE32F-NEXT:    li a2, 0
-; ZVE32F-NEXT:    lui a3, 2
-; ZVE32F-NEXT:    add a3, a1, a3
-; ZVE32F-NEXT:    li a4, 1
+; ZVE32F-NEXT:    lui a4, 2
+; ZVE32F-NEXT:    li a3, 1
+; ZVE32F-NEXT:    add a4, a1, a4
 ; ZVE32F-NEXT:    li a5, 40
 ; ZVE32F-NEXT:  .LBB13_1: # %bb2
 ; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -740,18 +740,18 @@ define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocaptu
 ; ZVE32F-NEXT:    ld a7, 8(a1)
 ; ZVE32F-NEXT:    ld t0, 16(a1)
 ; ZVE32F-NEXT:    ld t1, 24(a1)
-; ZVE32F-NEXT:    mul t2, a4, a5
-; ZVE32F-NEXT:    add t2, a0, t2
+; ZVE32F-NEXT:    mul t2, a3, a5
 ; ZVE32F-NEXT:    mul t3, a2, a5
+; ZVE32F-NEXT:    addi a2, a2, 4
+; ZVE32F-NEXT:    addi a1, a1, 32
+; ZVE32F-NEXT:    add t2, a0, t2
 ; ZVE32F-NEXT:    add t3, a0, t3
 ; ZVE32F-NEXT:    sd a6, 0(t3)
 ; ZVE32F-NEXT:    sd a7, 0(t2)
 ; ZVE32F-NEXT:    sd t0, 80(t3)
 ; ZVE32F-NEXT:    sd t1, 80(t2)
-; ZVE32F-NEXT:    addi a2, a2, 4
-; ZVE32F-NEXT:    addi a1, a1, 32
-; ZVE32F-NEXT:    addi a4, a4, 4
-; ZVE32F-NEXT:    bne a1, a3, .LBB13_1
+; ZVE32F-NEXT:    addi a3, a3, 4
+; ZVE32F-NEXT:    bne a1, a4, .LBB13_1
 ; ZVE32F-NEXT:  # %bb.2: # %bb18
 ; ZVE32F-NEXT:    ret
 bb:
@@ -794,36 +794,36 @@ define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, pt
 ; CHECK-NEXT:    mv a4, a2
 ; CHECK-NEXT:    bltu a5, a6, .LBB14_5
 ; CHECK-NEXT:  # %bb.2: # %bb9
-; CHECK-NEXT:    slli a5, a5, 32
-; CHECK-NEXT:    srli a5, a5, 32
-; CHECK-NEXT:    addi a5, a5, 1
-; CHECK-NEXT:    andi a6, a5, -32
-; CHECK-NEXT:    add a4, a6, a2
+; CHECK-NEXT:    slli a4, a5, 32
 ; CHECK-NEXT:    slli t0, a2, 2
-; CHECK-NEXT:    add a7, a0, a2
-; CHECK-NEXT:    add a2, a1, a2
-; CHECK-NEXT:    add a2, a2, t0
-; CHECK-NEXT:    add t0, a4, a0
+; CHECK-NEXT:    add a5, a0, a2
+; CHECK-NEXT:    add a6, a1, a2
 ; CHECK-NEXT:    li t2, 32
+; CHECK-NEXT:    srli a4, a4, 32
+; CHECK-NEXT:    add t0, a6, t0
+; CHECK-NEXT:    addi a6, a4, 1
+; CHECK-NEXT:    andi a7, a6, -32
+; CHECK-NEXT:    add a4, a7, a2
+; CHECK-NEXT:    add a2, a4, a0
 ; CHECK-NEXT:    li t1, 5
 ; CHECK-NEXT:    vsetvli zero, t2, e8, m1, ta, ma
 ; CHECK-NEXT:  .LBB14_3: # %bb15
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vlse8.v v8, (a2), t1
-; CHECK-NEXT:    vle8.v v9, (a7)
+; CHECK-NEXT:    vlse8.v v8, (t0), t1
+; CHECK-NEXT:    vle8.v v9, (a5)
 ; CHECK-NEXT:    vadd.vv v8, v9, v8
-; CHECK-NEXT:    vse8.v v8, (a7)
-; CHECK-NEXT:    addi a7, a7, 32
-; CHECK-NEXT:    addi a2, a2, 160
-; CHECK-NEXT:    bne a7, t0, .LBB14_3
+; CHECK-NEXT:    vse8.v v8, (a5)
+; CHECK-NEXT:    addi a5, a5, 32
+; CHECK-NEXT:    addi t0, t0, 160
+; CHECK-NEXT:    bne a5, a2, .LBB14_3
 ; CHECK-NEXT:  # %bb.4: # %bb30
-; CHECK-NEXT:    beq a5, a6, .LBB14_7
+; CHECK-NEXT:    beq a6, a7, .LBB14_7
 ; CHECK-NEXT:  .LBB14_5: # %bb32
 ; CHECK-NEXT:    add a2, a0, a4
 ; CHECK-NEXT:    slli a5, a4, 2
 ; CHECK-NEXT:    add a1, a1, a4
-; CHECK-NEXT:    add a1, a1, a5
 ; CHECK-NEXT:    subw a3, a3, a4
+; CHECK-NEXT:    add a1, a1, a5
 ; CHECK-NEXT:    slli a3, a3, 32
 ; CHECK-NEXT:    srli a3, a3, 32
 ; CHECK-NEXT:    add a0, a4, a0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 47efa058df641a..1c2c90478a1f77 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -542,20 +542,20 @@ declare <3 x double> @llvm.experimental.vp.strided.load.v3f64.p0.i32(ptr, i32, <
 define <32 x double> @strided_vpload_v32f64(ptr %ptr, i32 signext %stride, <32 x i1> %m, i32 zeroext %evl) nounwind {
 ; CHECK-LABEL: strided_vpload_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a4, 16
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    li a4, 16
 ; CHECK-NEXT:    mv a3, a2
 ; CHECK-NEXT:    bltu a2, a4, .LBB45_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:  .LBB45_2:
 ; CHECK-NEXT:    mul a4, a3, a1
-; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    addi a5, a2, -16
-; CHECK-NEXT:    sltu a2, a2, a5
-; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v8, v9, 2
+; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    sltu a2, a2, a5
+; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
@@ -578,8 +578,8 @@ define <32 x double> @strided_vpload_v32f64_allones_mask(ptr %ptr, i32 signext %
 ; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:  .LBB46_2:
 ; CHECK-NEXT:    mul a4, a3, a1
-; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    addi a5, a2, -16
+; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    sltu a2, a2, a5
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5
@@ -598,8 +598,8 @@ declare <32 x double> @llvm.experimental.vp.strided.load.v32f64.p0.i32(ptr, i32,
 define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-RV32-LABEL: strided_load_v33f64:
 ; CHECK-RV32:       # %bb.0:
-; CHECK-RV32-NEXT:    li a5, 32
 ; CHECK-RV32-NEXT:    vmv1r.v v8, v0
+; CHECK-RV32-NEXT:    li a5, 32
 ; CHECK-RV32-NEXT:    mv a3, a4
 ; CHECK-RV32-NEXT:    bltu a4, a5, .LBB47_2
 ; CHECK-RV32-NEXT:  # %bb.1:
@@ -637,19 +637,19 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v8, (a1), a2, v0.t
+; CHECK-RV32-NEXT:    addi a1, a0, 128
+; CHECK-RV32-NEXT:    addi a2, a0, 256
 ; CHECK-RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vse64.v v8, (a0)
-; CHECK-RV32-NEXT:    addi a1, a0, 128
 ; CHECK-RV32-NEXT:    vse64.v v24, (a1)
-; CHECK-RV32-NEXT:    addi a0, a0, 256
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vse64.v v16, (a0)
+; CHECK-RV32-NEXT:    vse64.v v16, (a2)
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: strided_load_v33f64:
 ; CHECK-RV64:       # %bb.0:
-; CHECK-RV64-NEXT:    li a5, 32
 ; CHECK-RV64-NEXT:    vmv1r.v v8, v0
+; CHECK-RV64-NEXT:    li a5, 32
 ; CHECK-RV64-NEXT:    mv a4, a3
 ; CHECK-RV64-NEXT:    bltu a3, a5, .LBB47_2
 ; CHECK-RV64-NEXT:  # %bb.1:
@@ -687,13 +687,13 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v8, (a1), a2, v0.t
+; CHECK-RV64-NEXT:    addi a1, a0, 128
+; CHECK-RV64-NEXT:    addi a2, a0, 256
 ; CHECK-RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vse64.v v8, (a0)
-; CHECK-RV64-NEXT:    addi a1, a0, 128
 ; CHECK-RV64-NEXT:    vse64.v v24, (a1)
-; CHECK-RV64-NEXT:    addi a0, a0, 256
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vse64.v v16, (a0)
+; CHECK-RV64-NEXT:    vse64.v v16, (a2)
 ; CHECK-RV64-NEXT:    ret
   %v = call <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr %ptr, i64 %stride, <33 x i1> %mask, i32 %evl)
   ret <33 x double> %v

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index ad55f276a74c8a..12893ec55cda76 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -227,35 +227,36 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 6
+; CHECK-NEXT:    li a3, 72
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 72 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 40
-; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    slli a2, a2, 5
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v25, v0, 8
+; CHECK-NEXT:    vslidedown.vi v6, v0, 8
 ; CHECK-NEXT:    addi a2, a1, 512
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v27, v25, 4
 ; CHECK-NEXT:    addi a3, a1, 640
+; CHECK-NEXT:    addi a4, a7, -64
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v27, v6, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a3)
+; CHECK-NEXT:    sltu a3, a7, a4
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v27, 2
-; CHECK-NEXT:    addi a3, a7, -64
-; CHECK-NEXT:    sltu a4, a7, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a4, a4, a3
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a4, a3, a4
 ; CHECK-NEXT:    addi a3, a4, -32
 ; CHECK-NEXT:    sltu a5, a4, a3
 ; CHECK-NEXT:    addi a5, a5, -1
@@ -267,8 +268,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 24
-; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    slli a5, a5, 4
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
@@ -282,23 +282,22 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:  .LBB16_2:
+; CHECK-NEXT:    vmv1r.v v0, v27
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a5)
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 48
+; CHECK-NEXT:    li a6, 56
 ; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v28, v26, 2
+; CHECK-NEXT:    vslidedown.vi v27, v26, 2
 ; CHECK-NEXT:    li a5, 64
-; CHECK-NEXT:    vmv1r.v v0, v27
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a6, 56
-; CHECK-NEXT:    mul a3, a3, a6
+; CHECK-NEXT:    slli a3, a3, 6
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -307,12 +306,14 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    li a6, 64
 ; CHECK-NEXT:  .LBB16_4:
+; CHECK-NEXT:    vmv1r.v v0, v27
 ; CHECK-NEXT:    addi a5, a1, 384
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 3
+; CHECK-NEXT:    li t1, 48
+; CHECK-NEXT:    mul t0, t0, t1
 ; CHECK-NEXT:    add t0, sp, t0
 ; CHECK-NEXT:    addi t0, t0, 16
 ; CHECK-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
@@ -324,9 +325,8 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    sltu t1, a6, t0
 ; CHECK-NEXT:    addi t1, t1, -1
 ; CHECK-NEXT:    and t0, t1, t0
-; CHECK-NEXT:    vmv1r.v v0, v28
 ; CHECK-NEXT:    csrr t1, vlenb
-; CHECK-NEXT:    li t2, 48
+; CHECK-NEXT:    li t2, 56
 ; CHECK-NEXT:    mul t1, t1, t2
 ; CHECK-NEXT:    add t1, sp, t1
 ; CHECK-NEXT:    addi t1, t1, 16
@@ -334,7 +334,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vsetvli zero, t0, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
 ; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 4
+; CHECK-NEXT:    slli t0, t0, 3
 ; CHECK-NEXT:    add t0, sp, t0
 ; CHECK-NEXT:    addi t0, t0, 16
 ; CHECK-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
@@ -342,123 +342,145 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    li a6, 16
 ; CHECK-NEXT:  .LBB16_6:
+; CHECK-NEXT:    vmv1r.v v0, v26
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a5)
 ; CHECK-NEXT:    addi a5, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a1, 256
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v27, v25, 2
-; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vslidedown.vi v26, v6, 2
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
+; CHECK-NEXT:    li t0, 48
+; CHECK-NEXT:    mul a5, a5, t0
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a6, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 48
+; CHECK-NEXT:    li a6, 56
 ; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    mv a5, a4
 ; CHECK-NEXT:    bltu a4, a3, .LBB16_8
 ; CHECK-NEXT:  # %bb.7:
 ; CHECK-NEXT:    li a5, 32
 ; CHECK-NEXT:  .LBB16_8:
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a5, -16
 ; CHECK-NEXT:    sltu a5, a5, a1
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a1, a5, a1
-; CHECK-NEXT:    vmv1r.v v0, v27
+; CHECK-NEXT:    vmv1r.v v0, v26
 ; CHECK-NEXT:    addi a5, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a5, 40
+; CHECK-NEXT:    mul a1, a1, a5
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a4, a2, .LBB16_10
 ; CHECK-NEXT:  # %bb.9:
 ; CHECK-NEXT:    li a4, 16
 ; CHECK-NEXT:  .LBB16_10:
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v6, v7, 2
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vslidedown.vi v25, v7, 2
+; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    li a4, 48
+; CHECK-NEXT:    mul a1, a1, a4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
-; CHECK-NEXT:    vmv.v.v v0, v8
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    mv a1, a7
 ; CHECK-NEXT:    bltu a7, a3, .LBB16_12
 ; CHECK-NEXT:  # %bb.11:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:  .LBB16_12:
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 24
-; CHECK-NEXT:    mul a4, a4, a5
+; CHECK-NEXT:    slli a4, a4, 4
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v24, v16
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 56
-; CHECK-NEXT:    mul a4, a4, a5
+; CHECK-NEXT:    slli a4, a4, 3
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v24, 16
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 56
+; CHECK-NEXT:    li a5, 40
 ; CHECK-NEXT:    mul a4, a4, a5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 4
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v24, v8
+; CHECK-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 48
+; CHECK-NEXT:    li a5, 40
 ; CHECK-NEXT:    mul a4, a4, a5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vslideup.vi v8, v24, 16
+; CHECK-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 48
-; CHECK-NEXT:    mul a4, a4, a5
+; CHECK-NEXT:    slli a4, a4, 6
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv4r.v v8, v0
-; CHECK-NEXT:    vslideup.vi v8, v16, 16
+; CHECK-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vslideup.vi v16, v24, 16
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 24
-; CHECK-NEXT:    mul a4, a4, a5
+; CHECK-NEXT:    slli a4, a4, 6
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a4, a1, -16
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li a6, 56
+; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vslideup.vi v16, v8, 16
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li a6, 56
+; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li a6, 48
+; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li a6, 40
+; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vslideup.vi v8, v16, 16
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li a6, 48
+; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    sltu a1, a1, a4
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a4
-; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 5
+; CHECK-NEXT:    li a5, 24
+; CHECK-NEXT:    mul a4, a4, a5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
@@ -470,19 +492,18 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB16_14:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a7, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v24, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v16, v8, 16
-; CHECK-NEXT:    vse32.v v16, (a0)
+; CHECK-NEXT:    vslideup.vi v24, v8, 16
+; CHECK-NEXT:    vse32.v v24, (a0)
 ; CHECK-NEXT:    addi a1, a0, 256
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    li a3, 48
 ; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
@@ -490,7 +511,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vse32.v v8, (a1)
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 48
+; CHECK-NEXT:    li a3, 56
 ; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
@@ -498,14 +519,14 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vse32.v v8, (a1)
 ; CHECK-NEXT:    addi a0, a0, 384
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 56
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 6
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 6
+; CHECK-NEXT:    li a1, 72
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index 293b75dc207c86..db03dc3d5ab1e2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -313,10 +313,12 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m)
 ; RV32-SLOW-NEXT:  .LBB6_8: # %cond.store5
 ; RV32-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vslidedown.vi v8, v8, 3
+; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32-SLOW-NEXT:    vslidedown.vi v9, v9, 3
+; RV32-SLOW-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vmv.x.s a0, v8
 ; RV32-SLOW-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32-SLOW-NEXT:    vslidedown.vi v8, v9, 3
-; RV32-SLOW-NEXT:    vmv.x.s a1, v8
+; RV32-SLOW-NEXT:    vmv.x.s a1, v9
 ; RV32-SLOW-NEXT:    srli a2, a0, 8
 ; RV32-SLOW-NEXT:    sb a0, 0(a1)
 ; RV32-SLOW-NEXT:    sb a2, 1(a1)
@@ -376,10 +378,12 @@ define void @mscatter_v4i16_align1(<4 x i16> %val, <4 x ptr> %ptrs, <4 x i1> %m)
 ; RV64-SLOW-NEXT:  .LBB6_8: # %cond.store5
 ; RV64-SLOW-NEXT:    vsetivli zero, 1, e16, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 3
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV64-SLOW-NEXT:    vslidedown.vi v10, v10, 3
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vmv.x.s a0, v8
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; RV64-SLOW-NEXT:    vslidedown.vi v8, v10, 3
-; RV64-SLOW-NEXT:    vmv.x.s a1, v8
+; RV64-SLOW-NEXT:    vmv.x.s a1, v10
 ; RV64-SLOW-NEXT:    srli a2, a0, 8
 ; RV64-SLOW-NEXT:    sb a0, 0(a1)
 ; RV64-SLOW-NEXT:    sb a2, 1(a1)
@@ -426,9 +430,9 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m)
 ; RV32-SLOW-NEXT:  .LBB7_4: # %cond.store1
 ; RV32-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV32-SLOW-NEXT:    vslidedown.vi v8, v8, 1
+; RV32-SLOW-NEXT:    vslidedown.vi v9, v9, 1
 ; RV32-SLOW-NEXT:    vmv.x.s a0, v8
-; RV32-SLOW-NEXT:    vslidedown.vi v8, v9, 1
-; RV32-SLOW-NEXT:    vmv.x.s a1, v8
+; RV32-SLOW-NEXT:    vmv.x.s a1, v9
 ; RV32-SLOW-NEXT:    srli a2, a0, 16
 ; RV32-SLOW-NEXT:    sh a0, 0(a1)
 ; RV32-SLOW-NEXT:    sh a2, 2(a1)
@@ -458,10 +462,12 @@ define void @mscatter_v2i32_align2(<2 x i32> %val, <2 x ptr> %ptrs, <2 x i1> %m)
 ; RV64-SLOW-NEXT:  .LBB7_4: # %cond.store1
 ; RV64-SLOW-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vslidedown.vi v8, v8, 1
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV64-SLOW-NEXT:    vslidedown.vi v9, v9, 1
+; RV64-SLOW-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV64-SLOW-NEXT:    vmv.x.s a0, v8
 ; RV64-SLOW-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; RV64-SLOW-NEXT:    vslidedown.vi v8, v9, 1
-; RV64-SLOW-NEXT:    vmv.x.s a1, v8
+; RV64-SLOW-NEXT:    vmv.x.s a1, v9
 ; RV64-SLOW-NEXT:    srli a2, a0, 16
 ; RV64-SLOW-NEXT:    sh a0, 0(a1)
 ; RV64-SLOW-NEXT:    sh a2, 2(a1)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
index 4477ce73a9ffc0..fa82065f3b4131 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
@@ -301,38 +301,38 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v16, (a0)
 ; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    bltu a2, a1, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfsgnj.vv v8, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsgnj.vv v16, v16, v24, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
index 4e9862b05f4089..dfd509062ccf7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll
@@ -57,8 +57,8 @@ define <1 x i1> @fcmp_ogt_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -73,8 +73,8 @@ define <1 x i1> @fcmp_ogt_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -102,8 +102,8 @@ define <1 x i1> @fcmp_oge_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -118,8 +118,8 @@ define <1 x i1> @fcmp_oge_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -147,8 +147,8 @@ define <1 x i1> @fcmp_olt_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -163,8 +163,8 @@ define <1 x i1> @fcmp_olt_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -192,8 +192,8 @@ define <1 x i1> @fcmp_ole_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -208,8 +208,8 @@ define <1 x i1> @fcmp_ole_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -240,8 +240,8 @@ define <1 x i1> @fcmp_one_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -259,8 +259,8 @@ define <1 x i1> @fcmp_one_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -336,8 +336,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -355,8 +355,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -388,8 +388,8 @@ define <1 x i1> @fcmp_ugt_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -405,8 +405,8 @@ define <1 x i1> @fcmp_ugt_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -436,8 +436,8 @@ define <1 x i1> @fcmp_uge_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -453,8 +453,8 @@ define <1 x i1> @fcmp_uge_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -484,8 +484,8 @@ define <1 x i1> @fcmp_ult_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -501,8 +501,8 @@ define <1 x i1> @fcmp_ult_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -532,8 +532,8 @@ define <1 x i1> @fcmp_ule_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -549,8 +549,8 @@ define <1 x i1> @fcmp_ule_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -690,8 +690,8 @@ define <2 x i1> @fcmp_ogt_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -706,8 +706,8 @@ define <2 x i1> @fcmp_ogt_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -735,8 +735,8 @@ define <2 x i1> @fcmp_oge_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -751,8 +751,8 @@ define <2 x i1> @fcmp_oge_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -780,8 +780,8 @@ define <2 x i1> @fcmp_olt_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -796,8 +796,8 @@ define <2 x i1> @fcmp_olt_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -825,8 +825,8 @@ define <2 x i1> @fcmp_ole_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -841,8 +841,8 @@ define <2 x i1> @fcmp_ole_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -873,8 +873,8 @@ define <2 x i1> @fcmp_one_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -892,8 +892,8 @@ define <2 x i1> @fcmp_one_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -969,8 +969,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -988,8 +988,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -1021,8 +1021,8 @@ define <2 x i1> @fcmp_ugt_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1038,8 +1038,8 @@ define <2 x i1> @fcmp_ugt_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1069,8 +1069,8 @@ define <2 x i1> @fcmp_uge_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1086,8 +1086,8 @@ define <2 x i1> @fcmp_uge_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1117,8 +1117,8 @@ define <2 x i1> @fcmp_ult_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1134,8 +1134,8 @@ define <2 x i1> @fcmp_ult_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1165,8 +1165,8 @@ define <2 x i1> @fcmp_ule_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1182,8 +1182,8 @@ define <2 x i1> @fcmp_ule_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1323,8 +1323,8 @@ define <4 x i1> @fcmp_ogt_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1339,8 +1339,8 @@ define <4 x i1> @fcmp_ogt_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1368,8 +1368,8 @@ define <4 x i1> @fcmp_oge_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1384,8 +1384,8 @@ define <4 x i1> @fcmp_oge_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1413,8 +1413,8 @@ define <4 x i1> @fcmp_olt_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1429,8 +1429,8 @@ define <4 x i1> @fcmp_olt_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1458,8 +1458,8 @@ define <4 x i1> @fcmp_ole_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1474,8 +1474,8 @@ define <4 x i1> @fcmp_ole_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1506,8 +1506,8 @@ define <4 x i1> @fcmp_one_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -1525,8 +1525,8 @@ define <4 x i1> @fcmp_one_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -1602,8 +1602,8 @@ define <4 x i1> @fcmp_ueq_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -1621,8 +1621,8 @@ define <4 x i1> @fcmp_ueq_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -1654,8 +1654,8 @@ define <4 x i1> @fcmp_ugt_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1671,8 +1671,8 @@ define <4 x i1> @fcmp_ugt_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1702,8 +1702,8 @@ define <4 x i1> @fcmp_uge_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1719,8 +1719,8 @@ define <4 x i1> @fcmp_uge_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1750,8 +1750,8 @@ define <4 x i1> @fcmp_ult_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1767,8 +1767,8 @@ define <4 x i1> @fcmp_ult_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1798,8 +1798,8 @@ define <4 x i1> @fcmp_ule_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1815,8 +1815,8 @@ define <4 x i1> @fcmp_ule_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1956,8 +1956,8 @@ define <8 x i1> @fcmp_ogt_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1972,8 +1972,8 @@ define <8 x i1> @fcmp_ogt_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2001,8 +2001,8 @@ define <8 x i1> @fcmp_oge_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2017,8 +2017,8 @@ define <8 x i1> @fcmp_oge_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2046,8 +2046,8 @@ define <8 x i1> @fcmp_olt_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2062,8 +2062,8 @@ define <8 x i1> @fcmp_olt_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2091,8 +2091,8 @@ define <8 x i1> @fcmp_ole_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2107,8 +2107,8 @@ define <8 x i1> @fcmp_ole_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -2139,8 +2139,8 @@ define <8 x i1> @fcmp_one_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -2158,8 +2158,8 @@ define <8 x i1> @fcmp_one_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -2235,8 +2235,8 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -2254,8 +2254,8 @@ define <8 x i1> @fcmp_ueq_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -2287,8 +2287,8 @@ define <8 x i1> @fcmp_ugt_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2304,8 +2304,8 @@ define <8 x i1> @fcmp_ugt_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2335,8 +2335,8 @@ define <8 x i1> @fcmp_uge_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2352,8 +2352,8 @@ define <8 x i1> @fcmp_uge_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2383,8 +2383,8 @@ define <8 x i1> @fcmp_ult_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2400,8 +2400,8 @@ define <8 x i1> @fcmp_ult_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2431,8 +2431,8 @@ define <8 x i1> @fcmp_ule_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2448,8 +2448,8 @@ define <8 x i1> @fcmp_ule_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2797,9 +2797,9 @@ define <16 x i1> @fcmp_one_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -2817,9 +2817,9 @@ define <16 x i1> @fcmp_one_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -2896,9 +2896,9 @@ define <16 x i1> @fcmp_ueq_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -2916,9 +2916,9 @@ define <16 x i1> @fcmp_ueq_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -3981,8 +3981,8 @@ define <1 x i1> @fcmp_ogt_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -3997,8 +3997,8 @@ define <1 x i1> @fcmp_ogt_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4026,8 +4026,8 @@ define <1 x i1> @fcmp_oge_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4042,8 +4042,8 @@ define <1 x i1> @fcmp_oge_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4071,8 +4071,8 @@ define <1 x i1> @fcmp_olt_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4087,8 +4087,8 @@ define <1 x i1> @fcmp_olt_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4116,8 +4116,8 @@ define <1 x i1> @fcmp_ole_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4132,8 +4132,8 @@ define <1 x i1> @fcmp_ole_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4164,8 +4164,8 @@ define <1 x i1> @fcmp_one_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4183,8 +4183,8 @@ define <1 x i1> @fcmp_one_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4260,8 +4260,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4279,8 +4279,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4312,8 +4312,8 @@ define <1 x i1> @fcmp_ugt_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4329,8 +4329,8 @@ define <1 x i1> @fcmp_ugt_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4360,8 +4360,8 @@ define <1 x i1> @fcmp_uge_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4377,8 +4377,8 @@ define <1 x i1> @fcmp_uge_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4408,8 +4408,8 @@ define <1 x i1> @fcmp_ult_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4425,8 +4425,8 @@ define <1 x i1> @fcmp_ult_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4456,8 +4456,8 @@ define <1 x i1> @fcmp_ule_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4473,8 +4473,8 @@ define <1 x i1> @fcmp_ule_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4614,8 +4614,8 @@ define <2 x i1> @fcmp_ogt_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4630,8 +4630,8 @@ define <2 x i1> @fcmp_ogt_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4659,8 +4659,8 @@ define <2 x i1> @fcmp_oge_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4675,8 +4675,8 @@ define <2 x i1> @fcmp_oge_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4704,8 +4704,8 @@ define <2 x i1> @fcmp_olt_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4720,8 +4720,8 @@ define <2 x i1> @fcmp_olt_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4749,8 +4749,8 @@ define <2 x i1> @fcmp_ole_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4765,8 +4765,8 @@ define <2 x i1> @fcmp_ole_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4797,8 +4797,8 @@ define <2 x i1> @fcmp_one_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4816,8 +4816,8 @@ define <2 x i1> @fcmp_one_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4893,8 +4893,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4912,8 +4912,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4945,8 +4945,8 @@ define <2 x i1> @fcmp_ugt_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4962,8 +4962,8 @@ define <2 x i1> @fcmp_ugt_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4993,8 +4993,8 @@ define <2 x i1> @fcmp_uge_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5010,8 +5010,8 @@ define <2 x i1> @fcmp_uge_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5041,8 +5041,8 @@ define <2 x i1> @fcmp_ult_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5058,8 +5058,8 @@ define <2 x i1> @fcmp_ult_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5089,8 +5089,8 @@ define <2 x i1> @fcmp_ule_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5106,8 +5106,8 @@ define <2 x i1> @fcmp_ule_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5247,8 +5247,8 @@ define <4 x i1> @fcmp_ogt_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5263,8 +5263,8 @@ define <4 x i1> @fcmp_ogt_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5292,8 +5292,8 @@ define <4 x i1> @fcmp_oge_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5308,8 +5308,8 @@ define <4 x i1> @fcmp_oge_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5337,8 +5337,8 @@ define <4 x i1> @fcmp_olt_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5353,8 +5353,8 @@ define <4 x i1> @fcmp_olt_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5382,8 +5382,8 @@ define <4 x i1> @fcmp_ole_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5398,8 +5398,8 @@ define <4 x i1> @fcmp_ole_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -5430,8 +5430,8 @@ define <4 x i1> @fcmp_one_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -5449,8 +5449,8 @@ define <4 x i1> @fcmp_one_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -5526,8 +5526,8 @@ define <4 x i1> @fcmp_ueq_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -5545,8 +5545,8 @@ define <4 x i1> @fcmp_ueq_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -5578,8 +5578,8 @@ define <4 x i1> @fcmp_ugt_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5595,8 +5595,8 @@ define <4 x i1> @fcmp_ugt_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5626,8 +5626,8 @@ define <4 x i1> @fcmp_uge_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5643,8 +5643,8 @@ define <4 x i1> @fcmp_uge_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5674,8 +5674,8 @@ define <4 x i1> @fcmp_ult_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5691,8 +5691,8 @@ define <4 x i1> @fcmp_ult_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5722,8 +5722,8 @@ define <4 x i1> @fcmp_ule_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5739,8 +5739,8 @@ define <4 x i1> @fcmp_ule_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -6088,9 +6088,9 @@ define <8 x i1> @fcmp_one_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -6108,9 +6108,9 @@ define <8 x i1> @fcmp_one_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -6187,9 +6187,9 @@ define <8 x i1> @fcmp_ueq_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -6207,9 +6207,9 @@ define <8 x i1> @fcmp_ueq_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -7230,8 +7230,8 @@ define <1 x i1> @fcmp_ogt_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7246,8 +7246,8 @@ define <1 x i1> @fcmp_ogt_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7275,8 +7275,8 @@ define <1 x i1> @fcmp_oge_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7291,8 +7291,8 @@ define <1 x i1> @fcmp_oge_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7320,8 +7320,8 @@ define <1 x i1> @fcmp_olt_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7336,8 +7336,8 @@ define <1 x i1> @fcmp_olt_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7365,8 +7365,8 @@ define <1 x i1> @fcmp_ole_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7381,8 +7381,8 @@ define <1 x i1> @fcmp_ole_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7413,8 +7413,8 @@ define <1 x i1> @fcmp_one_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -7432,8 +7432,8 @@ define <1 x i1> @fcmp_one_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -7509,8 +7509,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -7528,8 +7528,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -7561,8 +7561,8 @@ define <1 x i1> @fcmp_ugt_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7578,8 +7578,8 @@ define <1 x i1> @fcmp_ugt_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7609,8 +7609,8 @@ define <1 x i1> @fcmp_uge_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7626,8 +7626,8 @@ define <1 x i1> @fcmp_uge_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7657,8 +7657,8 @@ define <1 x i1> @fcmp_ult_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7674,8 +7674,8 @@ define <1 x i1> @fcmp_ult_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7705,8 +7705,8 @@ define <1 x i1> @fcmp_ule_vf_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7722,8 +7722,8 @@ define <1 x i1> @fcmp_ule_fv_v1f64(<1 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7863,8 +7863,8 @@ define <2 x i1> @fcmp_ogt_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7879,8 +7879,8 @@ define <2 x i1> @fcmp_ogt_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7908,8 +7908,8 @@ define <2 x i1> @fcmp_oge_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7924,8 +7924,8 @@ define <2 x i1> @fcmp_oge_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7953,8 +7953,8 @@ define <2 x i1> @fcmp_olt_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7969,8 +7969,8 @@ define <2 x i1> @fcmp_olt_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7998,8 +7998,8 @@ define <2 x i1> @fcmp_ole_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -8014,8 +8014,8 @@ define <2 x i1> @fcmp_ole_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -8046,8 +8046,8 @@ define <2 x i1> @fcmp_one_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -8065,8 +8065,8 @@ define <2 x i1> @fcmp_one_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -8142,8 +8142,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -8161,8 +8161,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -8194,8 +8194,8 @@ define <2 x i1> @fcmp_ugt_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8211,8 +8211,8 @@ define <2 x i1> @fcmp_ugt_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8242,8 +8242,8 @@ define <2 x i1> @fcmp_uge_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8259,8 +8259,8 @@ define <2 x i1> @fcmp_uge_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8290,8 +8290,8 @@ define <2 x i1> @fcmp_ult_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8307,8 +8307,8 @@ define <2 x i1> @fcmp_ult_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8338,8 +8338,8 @@ define <2 x i1> @fcmp_ule_vf_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8355,8 +8355,8 @@ define <2 x i1> @fcmp_ule_fv_v2f64(<2 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8704,9 +8704,9 @@ define <4 x i1> @fcmp_one_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -8724,9 +8724,9 @@ define <4 x i1> @fcmp_one_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -8803,9 +8803,9 @@ define <4 x i1> @fcmp_ueq_vf_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -8823,9 +8823,9 @@ define <4 x i1> @fcmp_ueq_fv_v4f64(<4 x double> %va, double %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll
index 97641ff6d92d70..472f2073667dbe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll
@@ -509,8 +509,8 @@ define <1 x i1> @fcmps_uno_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -1041,8 +1041,8 @@ define <2 x i1> @fcmps_uno_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -1573,8 +1573,8 @@ define <4 x i1> @fcmps_uno_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -2105,8 +2105,8 @@ define <8 x i1> @fcmps_uno_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -2637,10 +2637,10 @@ define <16 x i1> @fcmps_uno_vf_v16f16(<16 x half> %va, half %b) nounwind strictf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfle.vf v12, v10, fa0
-; CHECK-NEXT:    vmfle.vv v10, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v10
-; CHECK-NEXT:    vmorn.mm v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v12, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v10, fa0
+; CHECK-NEXT:    vmnot.m v9, v12
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x half> poison, half %b, i32 0
   %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -3210,10 +3210,10 @@ define <32 x i1> @fcmps_uno_vf_v32f16(<32 x half> %va, half %b) nounwind strictf
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
-; CHECK-NEXT:    vmfle.vf v16, v12, fa0
-; CHECK-NEXT:    vmfle.vv v12, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v12
-; CHECK-NEXT:    vmorn.mm v0, v8, v16
+; CHECK-NEXT:    vmfle.vv v16, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v12, fa0
+; CHECK-NEXT:    vmnot.m v9, v16
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <32 x half> poison, half %b, i32 0
   %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer
@@ -3743,8 +3743,8 @@ define <1 x i1> @fcmps_uno_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -4275,8 +4275,8 @@ define <2 x i1> @fcmps_uno_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -4807,8 +4807,8 @@ define <4 x i1> @fcmps_uno_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -5339,10 +5339,10 @@ define <8 x i1> @fcmps_uno_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfle.vf v12, v10, fa0
-; CHECK-NEXT:    vmfle.vv v10, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v10
-; CHECK-NEXT:    vmorn.mm v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v12, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v10, fa0
+; CHECK-NEXT:    vmnot.m v9, v12
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x float> poison, float %b, i32 0
   %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer
@@ -5871,10 +5871,10 @@ define <16 x i1> @fcmps_uno_vf_v16f32(<16 x float> %va, float %b) nounwind stric
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
-; CHECK-NEXT:    vmfle.vf v16, v12, fa0
-; CHECK-NEXT:    vmfle.vv v12, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v12
-; CHECK-NEXT:    vmorn.mm v0, v8, v16
+; CHECK-NEXT:    vmfle.vv v16, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v12, fa0
+; CHECK-NEXT:    vmnot.m v9, v16
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <16 x float> poison, float %b, i32 0
   %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer
@@ -6403,8 +6403,8 @@ define <1 x i1> @fcmps_uno_vf_v1f64(<1 x double> %va, double %b) nounwind strict
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -6935,8 +6935,8 @@ define <2 x i1> @fcmps_uno_vf_v2f64(<2 x double> %va, double %b) nounwind strict
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -7467,10 +7467,10 @@ define <4 x i1> @fcmps_uno_vf_v4f64(<4 x double> %va, double %b) nounwind strict
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfle.vf v12, v10, fa0
-; CHECK-NEXT:    vmfle.vv v10, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v10
-; CHECK-NEXT:    vmorn.mm v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v12, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v10, fa0
+; CHECK-NEXT:    vmnot.m v9, v12
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <4 x double> poison, double %b, i32 0
   %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer
@@ -7999,10 +7999,10 @@ define <8 x i1> @fcmps_uno_vf_v8f64(<8 x double> %va, double %b) nounwind strict
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
-; CHECK-NEXT:    vmfle.vf v16, v12, fa0
-; CHECK-NEXT:    vmfle.vv v12, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v12
-; CHECK-NEXT:    vmorn.mm v0, v8, v16
+; CHECK-NEXT:    vmfle.vv v16, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v12, fa0
+; CHECK-NEXT:    vmnot.m v9, v16
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <8 x double> poison, double %b, i32 0
   %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
index f03f1ec639eb65..c61f9cd9b5bd77 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfma-vp.ll
@@ -66,12 +66,12 @@ define <2 x half> @vfma_vf_v2f16(<2 x half> %va, half %b, <2 x half> %vc, <2 x i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -92,12 +92,12 @@ define <2 x half> @vfma_vf_v2f16_unmasked(<2 x half> %va, half %b, <2 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -165,12 +165,12 @@ define <4 x half> @vfma_vf_v4f16(<4 x half> %va, half %b, <4 x half> %vc, <4 x i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -191,12 +191,12 @@ define <4 x half> @vfma_vf_v4f16_unmasked(<4 x half> %va, half %b, <4 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -264,14 +264,14 @@ define <8 x half> @vfma_vf_v8f16(<8 x half> %va, half %b, <8 x half> %vc, <8 x i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -290,14 +290,14 @@ define <8 x half> @vfma_vf_v8f16_unmasked(<8 x half> %va, half %b, <8 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <8 x half> poison, half %b, i32 0
   %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer
@@ -363,14 +363,14 @@ define <16 x half> @vfma_vf_v16f16(<16 x half> %va, half %b, <16 x half> %vc, <1
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <16 x half> poison, half %b, i32 0
   %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -389,14 +389,14 @@ define <16 x half> @vfma_vf_v16f16_unmasked(<16 x half> %va, half %b, <16 x half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <16 x half> poison, half %b, i32 0
   %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer
@@ -822,31 +822,31 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    addi a1, a2, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a2, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a2)
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a4
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
@@ -855,12 +855,18 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB50_2:
-; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
 ; CHECK-NEXT:    addi a1, a1, -1
@@ -871,20 +877,21 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vmv.v.v v16, v8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
@@ -907,24 +914,24 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    addi a1, a2, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a2, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a2)
+; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v24, (a2)
 ; CHECK-NEXT:    vle64.v v0, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a4
@@ -936,9 +943,9 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v8, v24
+; CHECK-NEXT:    vfmadd.vv v0, v8, v16
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
 ; CHECK-NEXT:    addi a1, a1, -1
@@ -947,9 +954,9 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v24, v16, v8
 ; CHECK-NEXT:    vmv8r.v v8, v0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
index 3be992b7e0c3b5..cad7adbc19f3c8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
@@ -393,38 +393,38 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v16, (a0)
 ; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    bltu a2, a1, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmax.vv v8, v8, v24, v0.t
+; CHECK-NEXT:    vfmax.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
index 49a774c29e432e..d8ee7a7044b49c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
@@ -393,38 +393,38 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a1)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v16, (a0)
 ; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a0, a2
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    bltu a2, a1, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmin.vv v8, v8, v24, v0.t
+; CHECK-NEXT:    vfmin.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
index e2189b55c8bf2c..a5d9b3439e29bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmuladd-vp.ll
@@ -610,31 +610,31 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    addi a1, a2, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a2, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a2)
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a4
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
@@ -643,12 +643,18 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB50_2:
-; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
 ; CHECK-NEXT:    addi a1, a1, -1
@@ -659,20 +665,21 @@ define <32 x double> @vfma_vv_v32f64(<32 x double> %va, <32 x double> %b, <32 x
 ; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vmv.v.v v16, v8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
@@ -695,24 +702,24 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    addi a1, a2, 128
-; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a1, a0, 128
-; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a2)
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, a2, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; CHECK-NEXT:    vle64.v v16, (a2)
+; CHECK-NEXT:    addi a2, a0, 128
+; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v24, (a2)
 ; CHECK-NEXT:    vle64.v v0, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a4
@@ -724,9 +731,9 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v8, v24
+; CHECK-NEXT:    vfmadd.vv v0, v8, v16
 ; CHECK-NEXT:    addi a0, a4, -16
 ; CHECK-NEXT:    sltu a1, a4, a0
 ; CHECK-NEXT:    addi a1, a1, -1
@@ -735,9 +742,9 @@ define <32 x double> @vfma_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v24, v16, v8
 ; CHECK-NEXT:    vmv8r.v v8, v0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
index 1144f776e7fbfc..aba9056c78cdaa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll
@@ -118,11 +118,11 @@ define void @vfwmacc_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a
 ; FOLDING:       # %bb.0:
 ; FOLDING-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; FOLDING-NEXT:    vfwmul.vv v12, v8, v9
+; FOLDING-NEXT:    vfwsub.vv v13, v9, v10
 ; FOLDING-NEXT:    vfwmacc.vv v11, v8, v10
-; FOLDING-NEXT:    vfwsub.vv v8, v9, v10
 ; FOLDING-NEXT:    vse64.v v12, (a0)
 ; FOLDING-NEXT:    vse64.v v11, (a1)
-; FOLDING-NEXT:    vse64.v v8, (a2)
+; FOLDING-NEXT:    vse64.v v13, (a2)
 ; FOLDING-NEXT:    ret
   %c = fpext <2 x float> %a to <2 x double>
   %d = fpext <2 x float> %b to <2 x double>

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index 88ae643ca742e7..24e75cde2ce915 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -296,15 +296,15 @@ define <32 x i8> @vpgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %
 ; RV64-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
 ; RV64-NEXT:    vluxei64.v v10, (a0), v16, v0.t
 ; RV64-NEXT:    addi a2, a1, -16
-; RV64-NEXT:    sltu a1, a1, a2
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
+; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v8
+; RV64-NEXT:    addi a1, a1, -1
+; RV64-NEXT:    and a1, a1, a2
 ; RV64-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; RV64-NEXT:    vluxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    li a0, 32
@@ -2052,11 +2052,11 @@ define <32 x double> @vpgather_v32f64(<32 x ptr> %ptrs, <32 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v24, (zero), v8, v0.t
 ; RV32-NEXT:    addi a1, a0, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -2103,11 +2103,11 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2119,10 +2119,10 @@ define <32 x double> @vpgather_baseidx_v32i8_v32f64(ptr %base, <32 x i8> %idxs,
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v10, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf8 v16, v10
-; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    vsext.vf8 v24, v8
 ; RV64-NEXT:    li a3, 16
+; RV64-NEXT:    vsext.vf8 v16, v10
+; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB95_2
@@ -2161,11 +2161,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2178,10 +2178,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:    vsext.vf8 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e8, m2, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v8
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB96_2
@@ -2221,11 +2221,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei16.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2247,11 +2247,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei16.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    addi a2, a1, -16
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a1, a1, a2
-; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v16, 16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2279,11 +2279,11 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2295,10 +2295,10 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v12, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf4 v16, v12
-; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    vsext.vf4 v24, v8
 ; RV64-NEXT:    li a3, 16
+; RV64-NEXT:    vsext.vf4 v16, v12
+; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB98_2
@@ -2337,11 +2337,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2354,10 +2354,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:    vsext.vf4 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e16, m4, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v8
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB99_2
@@ -2397,11 +2397,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2423,11 +2423,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16>
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    addi a2, a1, -16
+; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    sltu a1, a1, a2
 ; RV64-NEXT:    addi a1, a1, -1
 ; RV64-NEXT:    and a1, a1, a2
-; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v24, v16, 16
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2454,11 +2454,11 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2470,10 +2470,10 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v16, v8, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf2 v24, v16
-; RV64-NEXT:    vsll.vi v16, v24, 3
 ; RV64-NEXT:    vsext.vf2 v24, v8
 ; RV64-NEXT:    li a3, 16
+; RV64-NEXT:    vsext.vf2 v8, v16
+; RV64-NEXT:    vsll.vi v16, v8, 3
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB101_2
@@ -2511,11 +2511,11 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2528,10 +2528,10 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:    vsext.vf2 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf2 v16, v8
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB102_2
@@ -2570,11 +2570,11 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v8, (a0), v16, v0.t
 ; RV32-NEXT:    addi a2, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2587,10 +2587,10 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
 ; RV64-NEXT:    vzext.vf2 v24, v8
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v8, v8, 16
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-NEXT:    vzext.vf2 v16, v8
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsll.vi v8, v24, 3
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:    bltu a1, a3, .LBB103_2
@@ -2622,17 +2622,17 @@ define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x
 ; RV32-NEXT:    vnsrl.wi v24, v16, 0
 ; RV32-NEXT:    vnsrl.wi v16, v8, 0
 ; RV32-NEXT:    li a2, 32
+; RV32-NEXT:    addi a3, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-NEXT:    vslideup.vi v16, v24, 16
 ; RV32-NEXT:    vsll.vi v24, v16, 3
+; RV32-NEXT:    sltu a2, a1, a3
+; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
-; RV32-NEXT:    addi a2, a1, -16
-; RV32-NEXT:    sltu a3, a1, a2
-; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
-; RV32-NEXT:    and a2, a3, a2
+; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (a0), v8, v0.t
 ; RV32-NEXT:    li a2, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
index 351d7d4cd9b09b..71f497e4c7be48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll
@@ -374,11 +374,11 @@ define <32 x double> @vpload_v32f64(ptr %ptr, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a2, a1, -16
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    sltu a1, a1, a2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a0), v0.t
@@ -394,19 +394,19 @@ declare <33 x double> @llvm.vp.load.v33f64.p0(ptr, <33 x i1>, i32)
 define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpload_v33f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    li a4, 32
 ; CHECK-NEXT:    mv a3, a2
 ; CHECK-NEXT:    bltu a2, a4, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    addi a4, a3, -16
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v8, 2
 ; CHECK-NEXT:    sltu a3, a3, a4
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a3, a3, a4
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v8, 2
 ; CHECK-NEXT:    addi a4, a1, 128
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a4), v0.t
@@ -431,13 +431,13 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) {
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a1), v0.t
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    addi a2, a0, 256
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0)
-; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vse64.v v16, (a1)
-; CHECK-NEXT:    addi a0, a0, 256
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vse64.v v24, (a0)
+; CHECK-NEXT:    vse64.v v24, (a2)
 ; CHECK-NEXT:    ret
   %load = call <33 x double> @llvm.vp.load.v33f64.p0(ptr %ptr, <33 x i1> %m, i32 %evl)
   ret <33 x double> %load

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
index bc42b42208bc3d..a11c2b6bca12ec 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll
@@ -1176,46 +1176,37 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    li a1, 16
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    bltu a2, a1, .LBB79_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB79_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
-; CHECK-NEXT:    vmerge.vvm v24, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, tu, ma
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
index 9e37780f56e1c5..d691dcd5c54b6f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll
@@ -1756,11 +1756,11 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (zero), v24, v0.t
 ; RV32-NEXT:    addi a0, a1, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a1, a1, a0
 ; RV32-NEXT:    addi a1, a1, -1
 ; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
@@ -1775,11 +1775,11 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v24, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vle64.v v16, (a1)
 ; RV64-NEXT:    vle64.v v24, (a0)
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    mv a0, a2
@@ -1798,7 +1798,7 @@ define void @vpscatter_v32f64(<32 x double> %val, <32 x ptr> %ptrs, <32 x i1> %m
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT:    vsoxei64.v v8, (zero), v16, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add sp, sp, a0
@@ -1826,11 +1826,11 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a2, a2, a1
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1842,44 +1842,51 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    slli a4, a3, 3
+; RV64-NEXT:    add a3, a4, a3
 ; RV64-NEXT:    sub sp, sp, a3
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v24, (a1)
-; RV64-NEXT:    vmv1r.v v7, v0
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV64-NEXT:    vslidedown.vi v16, v24, 16
-; RV64-NEXT:    vmv4r.v v0, v24
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vsext.vf2 v0, v24
 ; RV64-NEXT:    vsext.vf2 v24, v16
 ; RV64-NEXT:    vsll.vi v16, v24, 3
-; RV64-NEXT:    vsext.vf2 v24, v0
-; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsll.vi v24, v24, 3
+; RV64-NEXT:    vsll.vi v24, v0, 3
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB84_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB84_2:
-; RV64-NEXT:    vmv1r.v v0, v7
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v0, v7, 2
+; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    and a1, a2, a1
-; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    slli a1, a0, 3
+; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -1906,11 +1913,11 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a2, a2, a1
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -1922,37 +1929,37 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a4, a3, 3
-; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v24, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf2 v0, v24
+; RV64-NEXT:    vsext.vf2 v16, v24
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v24, 16
+; RV64-NEXT:    vslidedown.vi v8, v24, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf2 v16, v24
-; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsll.vi v24, v0, 3
+; RV64-NEXT:    vsext.vf2 v24, v8
+; RV64-NEXT:    vsll.vi v8, v24, 3
+; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB85_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB85_2:
 ; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT:    vsoxei64.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
@@ -1960,14 +1967,14 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 3
-; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -1995,11 +2002,11 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    addi a1, a2, -16
+; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    sltu a2, a2, a1
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v0, v0, 2
 ; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v24, 16
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
@@ -2011,37 +2018,37 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a4, a3, 3
-; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a3, 32
 ; RV64-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; RV64-NEXT:    vle32.v v24, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    li a3, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vzext.vf2 v0, v24
+; RV64-NEXT:    vzext.vf2 v16, v24
 ; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v24, 16
+; RV64-NEXT:    vslidedown.vi v8, v24, 16
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vzext.vf2 v16, v24
-; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    li a3, 16
-; RV64-NEXT:    vsll.vi v24, v0, 3
+; RV64-NEXT:    vzext.vf2 v24, v8
+; RV64-NEXT:    vsll.vi v8, v24, 3
+; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    mv a1, a2
 ; RV64-NEXT:    bltu a2, a3, .LBB86_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB86_2:
 ; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
+; RV64-NEXT:    vsoxei64.v v16, (a0), v24, v0.t
 ; RV64-NEXT:    addi a1, a2, -16
 ; RV64-NEXT:    sltu a2, a2, a1
 ; RV64-NEXT:    addi a2, a2, -1
@@ -2049,14 +2056,14 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base,
 ; RV64-NEXT:    vslidedown.vi v0, v0, 2
 ; RV64-NEXT:    and a1, a2, a1
 ; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
+; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 3
-; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
index d34292abdce0d5..8eaa5efe163cdd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
@@ -292,11 +292,11 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
 ; CHECK-NEXT:    addi a2, a1, -16
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    sltu a1, a1, a2
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a2
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    addi a0, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0), v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
index 418b159c8fb98d..5975b0d0761ebb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll
@@ -11,8 +11,8 @@ define <1 x i8> @vrol_vv_v1i8(<1 x i8> %a, <1 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -33,8 +33,8 @@ define <1 x i8> @vrol_vx_v1i8(<1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -58,8 +58,8 @@ define <2 x i8> @vrol_vv_v2i8(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -80,8 +80,8 @@ define <2 x i8> @vrol_vx_v2i8(<2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -105,8 +105,8 @@ define <4 x i8> @vrol_vv_v4i8(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -127,8 +127,8 @@ define <4 x i8> @vrol_vx_v4i8(<4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -152,8 +152,8 @@ define <8 x i8> @vrol_vv_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -174,8 +174,8 @@ define <8 x i8> @vrol_vx_v8i8(<8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -199,8 +199,8 @@ define <16 x i8> @vrol_vv_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -221,8 +221,8 @@ define <16 x i8> @vrol_vx_v16i8(<16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -247,8 +247,8 @@ define <32 x i8> @vrol_vv_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -271,8 +271,8 @@ define <32 x i8> @vrol_vx_v32i8(<32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -298,8 +298,8 @@ define <64 x i8> @vrol_vv_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -322,8 +322,8 @@ define <64 x i8> @vrol_vx_v64i8(<64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -348,8 +348,8 @@ define <1 x i16> @vrol_vv_v1i16(<1 x i16> %a, <1 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -370,8 +370,8 @@ define <1 x i16> @vrol_vx_v1i16(<1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -395,8 +395,8 @@ define <2 x i16> @vrol_vv_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -417,8 +417,8 @@ define <2 x i16> @vrol_vx_v2i16(<2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -442,8 +442,8 @@ define <4 x i16> @vrol_vv_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -464,8 +464,8 @@ define <4 x i16> @vrol_vx_v4i16(<4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -489,8 +489,8 @@ define <8 x i16> @vrol_vv_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -511,8 +511,8 @@ define <8 x i16> @vrol_vx_v8i16(<8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -536,8 +536,8 @@ define <16 x i16> @vrol_vv_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -558,8 +558,8 @@ define <16 x i16> @vrol_vx_v16i16(<16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -584,8 +584,8 @@ define <32 x i16> @vrol_vv_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -608,8 +608,8 @@ define <32 x i16> @vrol_vx_v32i16(<32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -634,12 +634,12 @@ define <1 x i32> @vrol_vv_v1i32(<1 x i32> %a, <1 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v1i32:
@@ -658,8 +658,8 @@ define <1 x i32> @vrol_vx_v1i32(<1 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -683,12 +683,12 @@ define <2 x i32> @vrol_vv_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v2i32:
@@ -707,8 +707,8 @@ define <2 x i32> @vrol_vx_v2i32(<2 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -732,12 +732,12 @@ define <4 x i32> @vrol_vv_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v4i32:
@@ -756,8 +756,8 @@ define <4 x i32> @vrol_vx_v4i32(<4 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -781,12 +781,12 @@ define <8 x i32> @vrol_vv_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsll.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v8i32:
@@ -805,8 +805,8 @@ define <8 x i32> @vrol_vx_v8i32(<8 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -830,12 +830,12 @@ define <16 x i32> @vrol_vv_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsll.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
+; CHECK-NEXT:    vsrl.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v16i32:
@@ -854,8 +854,8 @@ define <16 x i32> @vrol_vx_v16i32(<16 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -879,12 +879,12 @@ define <1 x i64> @vrol_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v1i64:
@@ -903,8 +903,8 @@ define <1 x i64> @vrol_vx_v1i64(<1 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -928,12 +928,12 @@ define <2 x i64> @vrol_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v2i64:
@@ -955,9 +955,9 @@ define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; RV32-NEXT:    vwsub.vx v11, v10, a0
 ; RV32-NEXT:    li a0, 63
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; RV32-NEXT:    vand.vx v9, v9, a0
 ; RV32-NEXT:    vand.vx v10, v11, a0
 ; RV32-NEXT:    vsrl.vv v10, v8, v10
-; RV32-NEXT:    vand.vx v9, v9, a0
 ; RV32-NEXT:    vsll.vv v8, v8, v9
 ; RV32-NEXT:    vor.vv v8, v8, v10
 ; RV32-NEXT:    ret
@@ -968,8 +968,8 @@ define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; RV64-NEXT:    vmv.v.x v9, a0
 ; RV64-NEXT:    li a0, 63
 ; RV64-NEXT:    vand.vx v10, v9, a0
-; RV64-NEXT:    vsll.vv v10, v8, v10
 ; RV64-NEXT:    vrsub.vi v9, v9, 0
+; RV64-NEXT:    vsll.vv v10, v8, v10
 ; RV64-NEXT:    vand.vx v9, v9, a0
 ; RV64-NEXT:    vsrl.vv v8, v8, v9
 ; RV64-NEXT:    vor.vv v8, v10, v8
@@ -993,12 +993,12 @@ define <4 x i64> @vrol_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsll.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v4i64:
@@ -1020,9 +1020,9 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; RV32-NEXT:    vwsub.vx v14, v12, a0
 ; RV32-NEXT:    li a0, 63
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT:    vand.vx v10, v10, a0
 ; RV32-NEXT:    vand.vx v12, v14, a0
 ; RV32-NEXT:    vsrl.vv v12, v8, v12
-; RV32-NEXT:    vand.vx v10, v10, a0
 ; RV32-NEXT:    vsll.vv v8, v8, v10
 ; RV32-NEXT:    vor.vv v8, v8, v12
 ; RV32-NEXT:    ret
@@ -1033,8 +1033,8 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; RV64-NEXT:    vmv.v.x v10, a0
 ; RV64-NEXT:    li a0, 63
 ; RV64-NEXT:    vand.vx v12, v10, a0
-; RV64-NEXT:    vsll.vv v12, v8, v12
 ; RV64-NEXT:    vrsub.vi v10, v10, 0
+; RV64-NEXT:    vsll.vv v12, v8, v12
 ; RV64-NEXT:    vand.vx v10, v10, a0
 ; RV64-NEXT:    vsrl.vv v8, v8, v10
 ; RV64-NEXT:    vor.vv v8, v12, v8
@@ -1058,12 +1058,12 @@ define <8 x i64> @vrol_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsll.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
+; CHECK-NEXT:    vsrl.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_v8i64:
@@ -1085,9 +1085,9 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; RV32-NEXT:    vwsub.vx v20, v16, a0
 ; RV32-NEXT:    li a0, 63
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vand.vx v16, v20, a0
 ; RV32-NEXT:    vsrl.vv v16, v8, v16
-; RV32-NEXT:    vand.vx v12, v12, a0
 ; RV32-NEXT:    vsll.vv v8, v8, v12
 ; RV32-NEXT:    vor.vv v8, v8, v16
 ; RV32-NEXT:    ret
@@ -1098,8 +1098,8 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; RV64-NEXT:    vmv.v.x v12, a0
 ; RV64-NEXT:    li a0, 63
 ; RV64-NEXT:    vand.vx v16, v12, a0
-; RV64-NEXT:    vsll.vv v16, v8, v16
 ; RV64-NEXT:    vrsub.vi v12, v12, 0
+; RV64-NEXT:    vsll.vv v16, v8, v16
 ; RV64-NEXT:    vand.vx v12, v12, a0
 ; RV64-NEXT:    vsrl.vv v8, v8, v12
 ; RV64-NEXT:    vor.vv v8, v16, v8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
index e4ddfeb4c4195a..68a9e217ccd1ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll
@@ -12,8 +12,8 @@ define <1 x i8> @vror_vv_v1i8(<1 x i8> %a, <1 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -34,8 +34,8 @@ define <1 x i8> @vror_vx_v1i8(<1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -96,8 +96,8 @@ define <2 x i8> @vror_vv_v2i8(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -118,8 +118,8 @@ define <2 x i8> @vror_vx_v2i8(<2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -180,8 +180,8 @@ define <4 x i8> @vror_vv_v4i8(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -202,8 +202,8 @@ define <4 x i8> @vror_vx_v4i8(<4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -264,8 +264,8 @@ define <8 x i8> @vror_vv_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -286,8 +286,8 @@ define <8 x i8> @vror_vx_v8i8(<8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -348,8 +348,8 @@ define <16 x i8> @vror_vv_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -370,8 +370,8 @@ define <16 x i8> @vror_vx_v16i8(<16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -433,8 +433,8 @@ define <32 x i8> @vror_vv_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -457,8 +457,8 @@ define <32 x i8> @vror_vx_v32i8(<32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -525,8 +525,8 @@ define <64 x i8> @vror_vv_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-NEXT:    li a0, 64
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -549,8 +549,8 @@ define <64 x i8> @vror_vx_v64i8(<64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -616,8 +616,8 @@ define <1 x i16> @vror_vv_v1i16(<1 x i16> %a, <1 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -638,8 +638,8 @@ define <1 x i16> @vror_vx_v1i16(<1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -700,8 +700,8 @@ define <2 x i16> @vror_vv_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -722,8 +722,8 @@ define <2 x i16> @vror_vx_v2i16(<2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -784,8 +784,8 @@ define <4 x i16> @vror_vv_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -806,8 +806,8 @@ define <4 x i16> @vror_vx_v4i16(<4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -868,8 +868,8 @@ define <8 x i16> @vror_vv_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -890,8 +890,8 @@ define <8 x i16> @vror_vx_v8i16(<8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -952,8 +952,8 @@ define <16 x i16> @vror_vv_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -974,8 +974,8 @@ define <16 x i16> @vror_vx_v16i16(<16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -1037,8 +1037,8 @@ define <32 x i16> @vror_vv_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -1061,8 +1061,8 @@ define <32 x i16> @vror_vx_v32i16(<32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -1128,12 +1128,12 @@ define <1 x i32> @vror_vv_v1i32(<1 x i32> %a, <1 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v1i32:
@@ -1152,8 +1152,8 @@ define <1 x i32> @vror_vx_v1i32(<1 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -1214,12 +1214,12 @@ define <2 x i32> @vror_vv_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v2i32:
@@ -1238,8 +1238,8 @@ define <2 x i32> @vror_vx_v2i32(<2 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -1300,12 +1300,12 @@ define <4 x i32> @vror_vv_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v4i32:
@@ -1324,8 +1324,8 @@ define <4 x i32> @vror_vx_v4i32(<4 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -1386,12 +1386,12 @@ define <8 x i32> @vror_vv_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vsll.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v8i32:
@@ -1410,8 +1410,8 @@ define <8 x i32> @vror_vx_v8i32(<8 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vx v10, v10, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -1472,12 +1472,12 @@ define <16 x i32> @vror_vv_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-NEXT:    vsll.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v16i32:
@@ -1496,8 +1496,8 @@ define <16 x i32> @vror_vx_v16i32(<16 x i32> %a, i32 %b) {
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vx v12, v12, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -1558,12 +1558,12 @@ define <1 x i64> @vror_vv_v1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v1i64:
@@ -1582,8 +1582,8 @@ define <1 x i64> @vror_vx_v1i64(<1 x i64> %a, i64 %b) {
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vx v9, v9, a0
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -1605,12 +1605,12 @@ define <1 x i64> @vror_vi_v1i64(<1 x i64> %a) {
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.i v9, 1
-; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
+; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a0
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
@@ -1638,12 +1638,12 @@ define <1 x i64> @vror_vi_rotl_v1i64(<1 x i64> %a) {
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; CHECK-RV32-NEXT:    vmv.v.i v9, 1
-; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    li a0, 63
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
-; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-RV32-NEXT:    vrsub.vi v9, v9, 0
 ; CHECK-RV32-NEXT:    vmv.s.x v10, a0
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
 ; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    ret
@@ -1674,12 +1674,12 @@ define <2 x i64> @vror_vv_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v2i64:
@@ -1701,9 +1701,9 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v11, v10, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vand.vx v10, v11, a0
 ; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
@@ -1714,8 +1714,8 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v9, a0
 ; CHECK-RV64-NEXT:    li a0, 63
 ; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
@@ -1741,12 +1741,12 @@ define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v9, v10, a0
-; CHECK-RV32-NEXT:    vsll.vv v9, v8, v9
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
+; CHECK-RV32-NEXT:    vmv.v.x v9, a0
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
+; CHECK-RV32-NEXT:    vand.vi v9, v9, 1
+; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v9
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v2i64:
@@ -1776,12 +1776,12 @@ define <2 x i64> @vror_vi_rotl_v2i64(<2 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v10, v9, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v9, v10, a0
-; CHECK-RV32-NEXT:    vsrl.vv v9, v8, v9
-; CHECK-RV32-NEXT:    vmv.v.x v10, a0
-; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v9
+; CHECK-RV32-NEXT:    vmv.v.x v9, a0
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
+; CHECK-RV32-NEXT:    vand.vi v9, v9, 1
+; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v9
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v2i64:
@@ -1810,12 +1810,12 @@ define <4 x i64> @vror_vv_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vsll.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v4i64:
@@ -1837,9 +1837,9 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v14, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV32-NEXT:    vand.vx v12, v14, a0
 ; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
@@ -1850,8 +1850,8 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v10, a0
 ; CHECK-RV64-NEXT:    li a0, 63
 ; CHECK-RV64-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV64-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-RV64-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-RV64-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-RV64-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
@@ -1877,12 +1877,12 @@ define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v10, v12, a0
-; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vmv.v.x v10, a0
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
+; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v4i64:
@@ -1912,12 +1912,12 @@ define <4 x i64> @vror_vi_rotl_v4i64(<4 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v12, v10, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v10, v12, a0
-; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vmv.v.x v12, a0
-; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vmv.v.x v10, a0
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
+; CHECK-RV32-NEXT:    vand.vi v10, v10, 1
+; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v4i64:
@@ -1946,12 +1946,12 @@ define <8 x i64> @vror_vv_v8i64(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-NEXT:    vsll.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_v8i64:
@@ -1973,9 +1973,9 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v20, v16, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vand.vx v16, v20, a0
 ; CHECK-RV32-NEXT:    vsll.vv v16, v8, v16
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    ret
@@ -1986,8 +1986,8 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v12, a0
 ; CHECK-RV64-NEXT:    li a0, 63
 ; CHECK-RV64-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV64-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-RV64-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-RV64-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-RV64-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
@@ -2013,12 +2013,12 @@ define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v16, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v12, v16, a0
-; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vmv.v.x v16, a0
-; CHECK-RV32-NEXT:    vand.vi v16, v16, 1
-; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v16
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vmv.v.x v12, a0
+; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
+; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
+; CHECK-RV32-NEXT:    vsll.vv v16, v8, v16
+; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_v8i64:
@@ -2048,12 +2048,12 @@ define <8 x i64> @vror_vi_rotl_v8i64(<8 x i64> %a) {
 ; CHECK-RV32-NEXT:    vwsubu.vx v16, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-RV32-NEXT:    vand.vx v12, v16, a0
-; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vmv.v.x v16, a0
-; CHECK-RV32-NEXT:    vand.vi v16, v16, 1
-; CHECK-RV32-NEXT:    vsll.vv v8, v8, v16
-; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vmv.v.x v12, a0
+; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
+; CHECK-RV32-NEXT:    vand.vi v12, v12, 1
+; CHECK-RV32-NEXT:    vsrl.vv v16, v8, v16
+; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
+; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    ret
 ;
 ; CHECK-RV64-LABEL: vror_vi_rotl_v8i64:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
index 46e794b8a787ba..888fc79f0122da 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -11,13 +11,13 @@ define <8 x i7> @vsadd_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsra.vi v9, v9, 1
 ; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    li a0, 63
-; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vmin.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a0, 192
 ; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index 18251eeef0f236..1d8af4c46cc078 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -158,48 +158,38 @@ define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i3
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v9, v8
-; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v6, v8
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    addi a4, a1, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v24, (a0)
-; CHECK-NEXT:    addi a0, a1, 128
-; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    addi a0, a3, -128
+; CHECK-NEXT:    vle8.v v8, (a4)
 ; CHECK-NEXT:    sltu a4, a3, a0
-; CHECK-NEXT:    vle8.v v0, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle8.v v16, (a1)
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a0, a4, a0
-; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v16, v24, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
 ; CHECK-NEXT:    bltu a3, a2, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -219,43 +209,43 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c
 ; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v24, (a0)
+; CHECK-NEXT:    vle8.v v16, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, a1, 128
 ; CHECK-NEXT:    vle8.v v24, (a0)
+; CHECK-NEXT:    vle8.v v16, (a1)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle8.v v24, (a1)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
@@ -424,11 +414,11 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a2
@@ -447,7 +437,7 @@ define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -465,36 +455,48 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c)
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v24, v0, 2
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v7, v0, 2
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -611,12 +613,12 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v24, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    bltu a2, a3, .LBB35_2
@@ -634,7 +636,7 @@ define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
index a3bba2dd8265c0..557882ee31d4cb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll
@@ -9,21 +9,21 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    slli a1, a2, 30
-; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    andi a4, a2, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a4
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 29
+; RV32-NEXT:    slli a4, a2, 29
 ; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    slli a1, a2, 28
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    slli a4, a2, 27
+; RV32-NEXT:    srli a2, a2, 5
 ; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 27
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    srli a2, a2, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
 ; RV32-NEXT:    vslide1down.vx v10, v10, a2
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -39,21 +39,21 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
 ; RV64-NEXT:    slli a1, a2, 62
-; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    andi a4, a2, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a4
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 61
+; RV64-NEXT:    slli a4, a2, 61
 ; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    slli a1, a2, 60
+; RV64-NEXT:    srli a4, a4, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
+; RV64-NEXT:    slli a4, a2, 59
+; RV64-NEXT:    srli a2, a2, 5
 ; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 59
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    srli a2, a2, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
 ; RV64-NEXT:    vslide1down.vx v10, v10, a2
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -77,21 +77,21 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    slli a1, a2, 30
-; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    andi a4, a2, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a4
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 29
+; RV32-NEXT:    slli a4, a2, 29
 ; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    slli a1, a2, 28
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    slli a4, a2, 27
+; RV32-NEXT:    srli a2, a2, 5
 ; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 27
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    srli a2, a2, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
 ; RV32-NEXT:    vslide1down.vx v10, v10, a2
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -107,21 +107,21 @@ define void @vselect_vx_v6i32(i32 %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
 ; RV64-NEXT:    slli a1, a2, 62
-; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    andi a4, a2, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a4
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 61
+; RV64-NEXT:    slli a4, a2, 61
 ; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    slli a1, a2, 60
+; RV64-NEXT:    srli a4, a4, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
+; RV64-NEXT:    slli a4, a2, 59
+; RV64-NEXT:    srli a2, a2, 5
 ; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 59
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    srli a2, a2, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
 ; RV64-NEXT:    vslide1down.vx v10, v10, a2
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -146,21 +146,21 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    andi a3, a1, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a3
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 29
+; RV32-NEXT:    slli a3, a1, 29
 ; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    slli a0, a1, 28
+; RV32-NEXT:    srli a3, a3, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
+; RV32-NEXT:    slli a3, a1, 27
+; RV32-NEXT:    srli a1, a1, 5
 ; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 27
-; RV32-NEXT:    srli a0, a0, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    srli a1, a1, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -176,21 +176,21 @@ define void @vselect_vi_v6i32(ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
 ; RV64-NEXT:    slli a0, a1, 62
-; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    andi a3, a1, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a3
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 61
+; RV64-NEXT:    slli a3, a1, 61
 ; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    slli a0, a1, 60
+; RV64-NEXT:    srli a3, a3, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
+; RV64-NEXT:    slli a3, a1, 59
+; RV64-NEXT:    srli a1, a1, 5
 ; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 59
-; RV64-NEXT:    srli a0, a0, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    srli a1, a1, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -214,21 +214,21 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a1)
 ; RV32-NEXT:    slli a1, a2, 30
-; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    andi a4, a2, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a4
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 29
+; RV32-NEXT:    slli a4, a2, 29
 ; RV32-NEXT:    srli a1, a1, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    slli a1, a2, 28
+; RV32-NEXT:    srli a4, a4, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
+; RV32-NEXT:    slli a4, a2, 27
+; RV32-NEXT:    srli a2, a2, 5
 ; RV32-NEXT:    srli a1, a1, 31
+; RV32-NEXT:    srli a4, a4, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    slli a1, a2, 27
-; RV32-NEXT:    srli a1, a1, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a1
-; RV32-NEXT:    srli a2, a2, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a4
 ; RV32-NEXT:    vslide1down.vx v10, v10, a2
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -244,21 +244,21 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a1)
 ; RV64-NEXT:    slli a1, a2, 62
-; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    andi a4, a2, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a4
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 61
+; RV64-NEXT:    slli a4, a2, 61
 ; RV64-NEXT:    srli a1, a1, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    slli a1, a2, 60
+; RV64-NEXT:    srli a4, a4, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
+; RV64-NEXT:    slli a4, a2, 59
+; RV64-NEXT:    srli a2, a2, 5
 ; RV64-NEXT:    srli a1, a1, 63
+; RV64-NEXT:    srli a4, a4, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    slli a1, a2, 59
-; RV64-NEXT:    srli a1, a1, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a1
-; RV64-NEXT:    srli a2, a2, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a4
 ; RV64-NEXT:    vslide1down.vx v10, v10, a2
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -282,21 +282,21 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    andi a3, a1, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a3
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 29
+; RV32-NEXT:    slli a3, a1, 29
 ; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    slli a0, a1, 28
+; RV32-NEXT:    srli a3, a3, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
+; RV32-NEXT:    slli a3, a1, 27
+; RV32-NEXT:    srli a1, a1, 5
 ; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 27
-; RV32-NEXT:    srli a0, a0, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    srli a1, a1, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -312,21 +312,21 @@ define void @vselect_vx_v6f32(float %a, ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
 ; RV64-NEXT:    slli a0, a1, 62
-; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    andi a3, a1, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a3
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 61
+; RV64-NEXT:    slli a3, a1, 61
 ; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    slli a0, a1, 60
+; RV64-NEXT:    srli a3, a3, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
+; RV64-NEXT:    slli a3, a1, 59
+; RV64-NEXT:    srli a1, a1, 5
 ; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 59
-; RV64-NEXT:    srli a0, a0, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    srli a1, a1, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1
@@ -351,21 +351,21 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) {
 ; RV32-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV32-NEXT:    vle32.v v8, (a0)
 ; RV32-NEXT:    slli a0, a1, 30
-; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    andi a3, a1, 1
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vmv.v.x v10, a3
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 29
+; RV32-NEXT:    slli a3, a1, 29
 ; RV32-NEXT:    srli a0, a0, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
 ; RV32-NEXT:    slli a0, a1, 28
+; RV32-NEXT:    srli a3, a3, 31
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
+; RV32-NEXT:    slli a3, a1, 27
+; RV32-NEXT:    srli a1, a1, 5
 ; RV32-NEXT:    srli a0, a0, 31
+; RV32-NEXT:    srli a3, a3, 31
 ; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    slli a0, a1, 27
-; RV32-NEXT:    srli a0, a0, 31
-; RV32-NEXT:    vslide1down.vx v10, v10, a0
-; RV32-NEXT:    srli a1, a1, 5
+; RV32-NEXT:    vslide1down.vx v10, v10, a3
 ; RV32-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32-NEXT:    vslidedown.vi v10, v10, 2
 ; RV32-NEXT:    vand.vi v10, v10, 1
@@ -381,21 +381,21 @@ define void @vselect_vfpzero_v6f32(ptr %b, ptr %cc, ptr %z) {
 ; RV64-NEXT:    vsetivli zero, 6, e32, m2, ta, ma
 ; RV64-NEXT:    vle32.v v8, (a0)
 ; RV64-NEXT:    slli a0, a1, 62
-; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    andi a3, a1, 1
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vmv.v.x v10, a3
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 61
+; RV64-NEXT:    slli a3, a1, 61
 ; RV64-NEXT:    srli a0, a0, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
 ; RV64-NEXT:    slli a0, a1, 60
+; RV64-NEXT:    srli a3, a3, 63
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
+; RV64-NEXT:    slli a3, a1, 59
+; RV64-NEXT:    srli a1, a1, 5
 ; RV64-NEXT:    srli a0, a0, 63
+; RV64-NEXT:    srli a3, a3, 63
 ; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    slli a0, a1, 59
-; RV64-NEXT:    srli a0, a0, 63
-; RV64-NEXT:    vslide1down.vx v10, v10, a0
-; RV64-NEXT:    srli a1, a1, 5
+; RV64-NEXT:    vslide1down.vx v10, v10, a3
 ; RV64-NEXT:    vslide1down.vx v10, v10, a1
 ; RV64-NEXT:    vslidedown.vi v10, v10, 2
 ; RV64-NEXT:    vand.vi v10, v10, 1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
index ddf98bab781413..8fad3db55f9bcd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -11,13 +11,13 @@ define <8 x i7> @vssub_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroe
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vadd.vv v9, v9, v9
-; CHECK-NEXT:    vsra.vi v9, v9, 1
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vsra.vi v9, v9, 1
 ; CHECK-NEXT:    vsra.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vsub.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    li a0, 63
-; CHECK-NEXT:    vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vmin.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    li a0, 192
 ; CHECK-NEXT:    vmax.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    ret
@@ -386,12 +386,12 @@ define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %e
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    addi a3, a1, -128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a3, a3, a0
+; CHECK-NEXT:    sltu a0, a1, a3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a3, a0, a3
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vssub.vx v16, v16, a0, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
index 0728bcf0fda584..ca35aa6c4a94c1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -381,12 +381,12 @@ define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    addi a3, a1, -128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    addi a0, a1, -128
-; CHECK-NEXT:    sltu a3, a1, a0
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a3, a3, a0
+; CHECK-NEXT:    sltu a0, a1, a3
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    and a3, a0, a3
 ; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-NEXT:    vssubu.vx v16, v16, a0, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index d9028b293e60b6..e6dfe5e78cdb4b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x bfloat> @vp_floor_nxv1bf16(<vscale x 1 x bfloat> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -44,10 +44,10 @@ define <vscale x 1 x bfloat> @vp_floor_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -69,10 +69,10 @@ define <vscale x 2 x bfloat> @vp_floor_nxv2bf16(<vscale x 2 x bfloat> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -94,10 +94,10 @@ define <vscale x 2 x bfloat> @vp_floor_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -120,10 +120,10 @@ define <vscale x 4 x bfloat> @vp_floor_nxv4bf16(<vscale x 4 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -146,10 +146,10 @@ define <vscale x 4 x bfloat> @vp_floor_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -172,10 +172,10 @@ define <vscale x 8 x bfloat> @vp_floor_nxv8bf16(<vscale x 8 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -198,10 +198,10 @@ define <vscale x 8 x bfloat> @vp_floor_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -224,10 +224,10 @@ define <vscale x 16 x bfloat> @vp_floor_nxv16bf16(<vscale x 16 x bfloat> %va, <v
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
@@ -250,10 +250,10 @@ define <vscale x 16 x bfloat> @vp_floor_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -279,62 +279,54 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v17, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 2
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vmv1r.v v8, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -354,51 +346,41 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 2
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
@@ -410,12 +392,6 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.floor.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -444,10 +420,10 @@ define <vscale x 1 x half> @vp_floor_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -484,10 +460,10 @@ define <vscale x 1 x half> @vp_floor_nxv1f16_unmasked(<vscale x 1 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -526,10 +502,10 @@ define <vscale x 2 x half> @vp_floor_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -566,10 +542,10 @@ define <vscale x 2 x half> @vp_floor_nxv2f16_unmasked(<vscale x 2 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -609,10 +585,10 @@ define <vscale x 4 x half> @vp_floor_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -650,10 +626,10 @@ define <vscale x 4 x half> @vp_floor_nxv4f16_unmasked(<vscale x 4 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -673,9 +649,9 @@ declare <vscale x 8 x half> @llvm.vp.floor.nxv8f16(<vscale x 8 x half>, <vscale
 define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv8f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -695,10 +671,10 @@ define <vscale x 8 x half> @vp_floor_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -736,10 +712,10 @@ define <vscale x 8 x half> @vp_floor_nxv8f16_unmasked(<vscale x 8 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -759,9 +735,9 @@ declare <vscale x 16 x half> @llvm.vp.floor.nxv16f16(<vscale x 16 x half>, <vsca
 define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -781,10 +757,10 @@ define <vscale x 16 x half> @vp_floor_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
@@ -822,10 +798,10 @@ define <vscale x 16 x half> @vp_floor_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -845,9 +821,9 @@ declare <vscale x 32 x half> @llvm.vp.floor.nxv32f16(<vscale x 32 x half>, <vsca
 define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_floor_nxv32f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
@@ -870,62 +846,54 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v8, v16
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -960,51 +928,41 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
 ;
 ; ZVFHMIN-LABEL: vp_floor_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -1016,12 +974,6 @@ define <vscale x 32 x half> @vp_floor_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.floor.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1290,9 +1242,9 @@ declare <vscale x 2 x double> @llvm.vp.floor.nxv2f64(<vscale x 2 x double>, <vsc
 define <vscale x 2 x double> @vp_floor_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv2f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -1334,9 +1286,9 @@ declare <vscale x 4 x double> @llvm.vp.floor.nxv4f64(<vscale x 4 x double>, <vsc
 define <vscale x 4 x double> @vp_floor_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -1378,9 +1330,9 @@ declare <vscale x 7 x double> @llvm.vp.floor.nxv7f64(<vscale x 7 x double>, <vsc
 define <vscale x 7 x double> @vp_floor_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv7f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1422,9 +1374,9 @@ declare <vscale x 8 x double> @llvm.vp.floor.nxv8f64(<vscale x 8 x double>, <vsc
 define <vscale x 8 x double> @vp_floor_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1475,12 +1427,12 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI44_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a3)
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
@@ -1501,23 +1453,26 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -1533,12 +1488,12 @@ define <vscale x 16 x double> @vp_floor_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-LABEL: vp_floor_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI45_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a3)
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI45_0)
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index c2c2beda94a0be..734dd5e33c4fcb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -144,55 +144,155 @@ define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
 declare <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
 
 define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; CHECK-LABEL: vfmax_nxv32bf16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v0, v8
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v3, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v3
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+; ZVFH-LABEL: vfmax_nxv32bf16_vv:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    addi sp, sp, -16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    sub sp, sp, a0
+; ZVFH-NEXT:    vmv8r.v v24, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv8r.v v0, v8
+; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv1r.v v0, v3
+; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vfmax.vv v16, v0, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFH-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFH-NEXT:    vfmax.vv v16, v8, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    add sp, sp, a0
+; ZVFH-NEXT:    addi sp, sp, 16
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vfmax_nxv32bf16_vv:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmax.vv v16, v0, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.maximum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
   ret <vscale x 32 x bfloat> %v
 }
@@ -395,40 +495,62 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v24, v24
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT:    vfmax.vv v8, v8, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v0, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v16, v0
-; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -436,7 +558,8 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
@@ -632,21 +755,19 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v10, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
-; ZVFHMIN-NEXT:    vfmax.vv v9, v10, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v11, v0
+; ZVFHMIN-NEXT:    vfmax.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -669,14 +790,12 @@ define <vscale x 1 x half> @vfmax_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 61fa2339570512..e90d3e3f3e8295 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -45,11 +45,9 @@ define <vscale x 1 x bfloat> @vfmax_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v11, v11
 ; CHECK-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -93,11 +91,9 @@ define <vscale x 2 x bfloat> @vfmax_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v11, v11
 ; CHECK-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; CHECK-NEXT:    vmv.v.v v0, v8
@@ -143,11 +139,9 @@ define <vscale x 4 x bfloat> @vfmax_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -193,11 +187,9 @@ define <vscale x 8 x bfloat> @vfmax_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v8, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -264,11 +256,9 @@ define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -302,58 +292,64 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    vmv8r.v v0, v8
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a4, a4, 5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v4
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; CHECK-NEXT:    vmv8r.v v0, v16
-; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmfeq.vv v12, v16, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
@@ -371,32 +367,27 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v24, v16, v16, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
@@ -451,89 +442,114 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v7, v24, a2
+; CHECK-NEXT:    vslidedown.vx v12, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v13, v24, v24, v0.t
+; CHECK-NEXT:    vmv8r.v v0, v16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    li a4, 24
+; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v4, v16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v3, v16, v16
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v3
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v0, v16
-; CHECK-NEXT:    vmv8r.v v8, v0
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add sp, sp, a0
@@ -599,11 +615,9 @@ define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -671,11 +685,9 @@ define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
@@ -745,11 +757,9 @@ define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -821,11 +831,9 @@ define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -918,11 +926,9 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
@@ -986,58 +992,64 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v24, v0
-; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a4
+; ZVFHMIN-NEXT:    mul a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -1055,32 +1067,27 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a2, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
@@ -1146,89 +1153,114 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v7, v24, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    li a4, 24
+; ZVFHMIN-NEXT:    mul a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v8, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a4
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v24, v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a2, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v0, v16
-; ZVFHMIN-NEXT:    vmv8r.v v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -1545,99 +1577,55 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 35
+; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x23, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 35 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    srli a4, a1, 3
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a4
+; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 27
-; CHECK-NEXT:    mul a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 18
-; CHECK-NEXT:    mul a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v7, v0, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs1r.v v7, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a3)
+; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v26, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 27
-; CHECK-NEXT:    mul a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v24, v16, v24, v0
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 10
-; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    slli a3, a3, 3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 27
-; CHECK-NEXT:    mul a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v16, v24, v24, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs1r.v v16, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a3, 19
-; CHECK-NEXT:    mul a0, a0, a3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
+; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vmv1r.v v0, v26
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a3, 10
-; CHECK-NEXT:    mul a0, a0, a3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v24, v0.t
+; CHECK-NEXT:    vfmax.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a3, 10
-; CHECK-NEXT:    mul a0, a0, a3
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -1645,65 +1633,32 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB40_2:
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 18
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 19
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 27
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 27
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 27
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 10
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 35
+; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -1731,12 +1686,12 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index 928171e1f9528b..21251ee2f3c630 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -144,55 +144,155 @@ define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
 declare <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat>, <vscale x 32 x bfloat>)
 
 define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) nounwind {
-; CHECK-LABEL: vfmin_nxv32bf16_vv:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v0, v8
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v3, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v3
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vmfeq.vv v7, v8, v8
-; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+; ZVFH-LABEL: vfmin_nxv32bf16_vv:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    addi sp, sp, -16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    sub sp, sp, a0
+; ZVFH-NEXT:    vmv8r.v v24, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv8r.v v0, v8
+; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFH-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv1r.v v0, v3
+; ZVFH-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vfmin.vv v16, v0, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFH-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFH-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFH-NEXT:    vfmin.vv v16, v8, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    add sp, sp, a0
+; ZVFH-NEXT:    addi sp, sp, 16
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: vfmin_nxv32bf16_vv:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmin.vv v16, v0, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.minimum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
   ret <vscale x 32 x bfloat> %v
 }
@@ -395,40 +495,62 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v24, v24
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v3
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
-; ZVFHMIN-NEXT:    vfmin.vv v8, v8, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v8, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vmfeq.vv v7, v8, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v16, v8, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v0, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v8, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v8, v16, v0
-; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v24
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v8, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -436,7 +558,8 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
 ; ZVFHMIN-NEXT:    ret
@@ -632,21 +755,19 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnana(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v11, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v9, v9
-; ZVFHMIN-NEXT:    vmerge.vvm v10, v11, v9, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10
+; ZVFHMIN-NEXT:    vmerge.vvm v9, v11, v10, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v9, v11, v0
-; ZVFHMIN-NEXT:    vfmin.vv v9, v10, v8
+; ZVFHMIN-NEXT:    vmerge.vvm v8, v10, v11, v0
+; ZVFHMIN-NEXT:    vfmin.vv v9, v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -669,14 +790,12 @@ define <vscale x 1 x half> @vfmin_nxv1f16_vv_nnanb(<vscale x 1 x half> %a, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v9, v10, v10
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v8, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v9, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index fea3eafaad5d02..818a90607ea073 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -45,11 +45,9 @@ define <vscale x 1 x bfloat> @vfmin_vv_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v11, v11
 ; CHECK-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -93,11 +91,9 @@ define <vscale x 2 x bfloat> @vfmin_vv_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v11, v11
 ; CHECK-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; CHECK-NEXT:    vmv.v.v v0, v8
@@ -143,11 +139,9 @@ define <vscale x 4 x bfloat> @vfmin_vv_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v10, v10
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v10, v10
 ; CHECK-NEXT:    vmfeq.vv v8, v12, v12
 ; CHECK-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -193,11 +187,9 @@ define <vscale x 8 x bfloat> @vfmin_vv_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v12, v12
-; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v12, v12
 ; CHECK-NEXT:    vmfeq.vv v8, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -264,11 +256,9 @@ define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
@@ -302,58 +292,64 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    vmv8r.v v0, v8
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a4, a4, 5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v4
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; CHECK-NEXT:    vmv8r.v v0, v16
-; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmfeq.vv v12, v16, v16, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
@@ -371,32 +367,27 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v24, v16, v16, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
@@ -451,89 +442,114 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v7, v24, a2
+; CHECK-NEXT:    vslidedown.vx v12, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v13, v24, v24, v0.t
+; CHECK-NEXT:    vmv8r.v v0, v16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    li a4, 24
+; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v4, v16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v3, v16, v16
+; CHECK-NEXT:    vmfeq.vv v0, v8, v8
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v3
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v0, v16
-; CHECK-NEXT:    vmv8r.v v8, v0
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add sp, sp, a0
@@ -599,11 +615,9 @@ define <vscale x 1 x half> @vfmin_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -671,11 +685,9 @@ define <vscale x 2 x half> @vfmin_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v11, v11
 ; ZVFHMIN-NEXT:    vmerge.vvm v9, v10, v11, v0
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
@@ -745,11 +757,9 @@ define <vscale x 4 x half> @vfmin_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v10, v10
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
 ; ZVFHMIN-NEXT:    vmerge.vvm v14, v10, v12, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -821,11 +831,9 @@ define <vscale x 8 x half> @vfmin_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v12, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v20, v12, v16, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
@@ -918,11 +926,9 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
@@ -986,58 +992,64 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v24, v0
-; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a4
+; ZVFHMIN-NEXT:    mul a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl1r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -1055,32 +1067,27 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a2, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
@@ -1146,89 +1153,114 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v7, v24, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    li a4, 24
+; ZVFHMIN-NEXT:    mul a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv4r.v v8, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a4
+; ZVFHMIN-NEXT:    li a3, 24
+; ZVFHMIN-NEXT:    mul a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmfeq.vv v12, v24, v24, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    vmerge.vvm v16, v24, v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v8, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a2, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v3, v16, v16
+; ZVFHMIN-NEXT:    vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v3
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v8, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v0, v16
-; ZVFHMIN-NEXT:    vmv8r.v v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -1545,99 +1577,55 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 35
+; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x23, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 35 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    srli a4, a1, 3
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a4
+; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 27
-; CHECK-NEXT:    mul a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 18
-; CHECK-NEXT:    mul a4, a4, a5
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v7, v0, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs1r.v v7, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a3)
+; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v26, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vmerge.vvm v24, v16, v8, v0
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 27
-; CHECK-NEXT:    mul a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v24, v16, v24, v0
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 10
-; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    slli a3, a3, 3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 27
-; CHECK-NEXT:    mul a3, a3, a4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v16, v24, v24, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs1r.v v16, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a3, 19
-; CHECK-NEXT:    mul a0, a0, a3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
+; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vmv1r.v v0, v26
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a3, 10
-; CHECK-NEXT:    mul a0, a0, a3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v24, v0.t
+; CHECK-NEXT:    vfmin.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a3, 10
-; CHECK-NEXT:    mul a0, a0, a3
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -1645,65 +1633,32 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB40_2:
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 18
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 19
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 27
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 27
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 27
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 10
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 35
+; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -1731,12 +1686,12 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v24, (a3)
-; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
+; CHECK-NEXT:    sltu a3, a2, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
index f22cd77db7a40b..7a4695d1c25c1b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll
@@ -155,10 +155,10 @@ define <vscale x 1 x float> @nearbyint_nxv1f32(<vscale x 1 x float> %v) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -179,10 +179,10 @@ define <vscale x 2 x float> @nearbyint_nxv2f32(<vscale x 2 x float> %v) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -203,10 +203,10 @@ define <vscale x 4 x float> @nearbyint_nxv4f32(<vscale x 4 x float> %v) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -227,10 +227,10 @@ define <vscale x 8 x float> @nearbyint_nxv8f32(<vscale x 8 x float> %v) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -251,10 +251,10 @@ define <vscale x 16 x float> @nearbyint_nxv16f32(<vscale x 16 x float> %v) stric
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
index 7b3a2ef1727907..807a3e460b153a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll
@@ -17,9 +17,9 @@ define <vscale x 1 x bfloat> @nearbyint_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
@@ -40,9 +40,9 @@ define <vscale x 2 x bfloat> @nearbyint_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
@@ -63,9 +63,9 @@ define <vscale x 4 x bfloat> @nearbyint_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
@@ -86,9 +86,9 @@ define <vscale x 8 x bfloat> @nearbyint_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
@@ -109,9 +109,9 @@ define <vscale x 16 x bfloat> @nearbyint_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
@@ -138,23 +138,23 @@ define <vscale x 32 x bfloat> @nearbyint_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    frflags a0
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
-; CHECK-NEXT:    fsflags a0
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24
+; CHECK-NEXT:    vfabs.v v8, v16
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vfabs.v v8, v24
+; CHECK-NEXT:    vmflt.vf v7, v8, fa5
+; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    fsflags a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
+; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    frflags a0
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -199,9 +199,9 @@ define <vscale x 1 x half> @nearbyint_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
@@ -238,9 +238,9 @@ define <vscale x 2 x half> @nearbyint_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
@@ -277,9 +277,9 @@ define <vscale x 4 x half> @nearbyint_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
@@ -316,9 +316,9 @@ define <vscale x 8 x half> @nearbyint_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
@@ -355,9 +355,9 @@ define <vscale x 16 x half> @nearbyint_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
@@ -400,23 +400,23 @@ define <vscale x 32 x half> @nearbyint_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
-; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    frflags a0
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
-; ZVFHMIN-NEXT:    fsflags a0
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24
+; ZVFHMIN-NEXT:    vfabs.v v8, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vfabs.v v8, v24
+; ZVFHMIN-NEXT:    vmflt.vf v7, v8, fa5
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    fsflags a0
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
index 5bc1ab9820d6cc..2fda344690bfc6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
@@ -282,10 +282,10 @@ define float @reduce_fadd4(float %x, float %y, <4 x float> %v, <4 x float> %w) {
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v10, fa0
 ; CHECK-NEXT:    vfredusum.vs v8, v8, v10
+; CHECK-NEXT:    vfmv.s.f v10, fa1
+; CHECK-NEXT:    vfredusum.vs v9, v9, v10
 ; CHECK-NEXT:    vfmv.f.s fa5, v8
-; CHECK-NEXT:    vfmv.s.f v8, fa1
-; CHECK-NEXT:    vfredusum.vs v8, v9, v8
-; CHECK-NEXT:    vfmv.f.s fa4, v8
+; CHECK-NEXT:    vfmv.f.s fa4, v9
 ; CHECK-NEXT:    fdiv.s fa0, fa5, fa4
 ; CHECK-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
index 4aa26d6b79ca46..025874a1a74e2e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-scalar-load-crash.ll
@@ -7,10 +7,10 @@
 define i32 @test(i32 %size, ptr %add.ptr, i64 %const) {
 ; RV32-LABEL: test:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    th.lbib a3, (a1), -1, 0
-; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    addi a3, a2, 1
+; RV32-NEXT:    th.lbib a4, (a1), -1, 0
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vmv.v.x v8, a4
 ; RV32-NEXT:    vmv.s.x v9, zero
 ; RV32-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
 ; RV32-NEXT:    vslideup.vx v8, v9, a2
@@ -33,10 +33,10 @@ define i32 @test(i32 %size, ptr %add.ptr, i64 %const) {
 ;
 ; RV64-LABEL: test:
 ; RV64:       # %bb.0: # %entry
-; RV64-NEXT:    th.lbib a3, (a1), -1, 0
-; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT:    vmv.v.x v8, a3
 ; RV64-NEXT:    addi a3, a2, 1
+; RV64-NEXT:    th.lbib a4, (a1), -1, 0
+; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT:    vmv.v.x v8, a4
 ; RV64-NEXT:    vmv.s.x v9, zero
 ; RV64-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
 ; RV64-NEXT:    vslideup.vx v8, v9, a2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 3cb1be1b7e71a8..f6598606b09f1a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -104,8 +104,8 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    sgtz a2, a1
 ; CHECK-NOV-NEXT:    sgtz a3, a0
 ; CHECK-NOV-NEXT:    neg a3, a3
-; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    neg a2, a2
+; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ret
 ;
@@ -275,12 +275,12 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) {
 ; CHECK-NOV-NEXT:    sgtz a7, a3
 ; CHECK-NOV-NEXT:    sgtz t0, a5
 ; CHECK-NOV-NEXT:    negw t0, t0
-; CHECK-NOV-NEXT:    and a5, t0, a5
 ; CHECK-NOV-NEXT:    negw a7, a7
-; CHECK-NOV-NEXT:    and a3, a7, a3
 ; CHECK-NOV-NEXT:    negw a6, a6
-; CHECK-NOV-NEXT:    and a2, a6, a2
 ; CHECK-NOV-NEXT:    negw a4, a4
+; CHECK-NOV-NEXT:    and a5, t0, a5
+; CHECK-NOV-NEXT:    and a3, a7, a3
+; CHECK-NOV-NEXT:    and a2, a6, a2
 ; CHECK-NOV-NEXT:    and a1, a4, a1
 ; CHECK-NOV-NEXT:    sw a5, 0(a0)
 ; CHECK-NOV-NEXT:    sw a3, 4(a0)
@@ -756,12 +756,12 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) {
 ; CHECK-NOV-NEXT:    sgtz a5, a1
 ; CHECK-NOV-NEXT:    sgtz a6, a3
 ; CHECK-NOV-NEXT:    negw a6, a6
-; CHECK-NOV-NEXT:    and a3, a6, a3
 ; CHECK-NOV-NEXT:    negw a5, a5
-; CHECK-NOV-NEXT:    and a1, a5, a1
 ; CHECK-NOV-NEXT:    negw a4, a4
-; CHECK-NOV-NEXT:    and a4, a4, s1
 ; CHECK-NOV-NEXT:    negw a2, a2
+; CHECK-NOV-NEXT:    and a3, a6, a3
+; CHECK-NOV-NEXT:    and a1, a5, a1
+; CHECK-NOV-NEXT:    and a4, a4, s1
 ; CHECK-NOV-NEXT:    and a0, a2, a0
 ; CHECK-NOV-NEXT:    sw a3, 0(s0)
 ; CHECK-NOV-NEXT:    sw a1, 4(s0)
@@ -992,8 +992,8 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    sgtz a2, a1
 ; CHECK-NOV-NEXT:    sgtz a3, a0
 ; CHECK-NOV-NEXT:    neg a3, a3
-; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    neg a2, a2
+; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ret
 ;
@@ -1167,12 +1167,12 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) {
 ; CHECK-NOV-NEXT:    sgtz a7, a3
 ; CHECK-NOV-NEXT:    sgtz t0, a5
 ; CHECK-NOV-NEXT:    negw t0, t0
-; CHECK-NOV-NEXT:    and a5, t0, a5
 ; CHECK-NOV-NEXT:    negw a7, a7
-; CHECK-NOV-NEXT:    and a3, a7, a3
 ; CHECK-NOV-NEXT:    negw a6, a6
-; CHECK-NOV-NEXT:    and a2, a6, a2
 ; CHECK-NOV-NEXT:    negw a4, a4
+; CHECK-NOV-NEXT:    and a5, t0, a5
+; CHECK-NOV-NEXT:    and a3, a7, a3
+; CHECK-NOV-NEXT:    and a2, a6, a2
 ; CHECK-NOV-NEXT:    and a1, a4, a1
 ; CHECK-NOV-NEXT:    sh a5, 0(a0)
 ; CHECK-NOV-NEXT:    sh a3, 2(a0)
@@ -2006,56 +2006,56 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    fcvt.l.s s1, fs6, rtz
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    fcvt.l.s a0, fa0, rtz
-; CHECK-NOV-NEXT:    lui a3, 16
-; CHECK-NOV-NEXT:    addiw a3, a3, -1
-; CHECK-NOV-NEXT:    bge a0, a3, .LBB17_10
+; CHECK-NOV-NEXT:    lui a4, 16
+; CHECK-NOV-NEXT:    addiw a4, a4, -1
+; CHECK-NOV-NEXT:    bge a0, a4, .LBB17_10
 ; CHECK-NOV-NEXT:  # %bb.1: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a1, fs5, rtz
-; CHECK-NOV-NEXT:    bge s1, a3, .LBB17_11
+; CHECK-NOV-NEXT:    bge s1, a4, .LBB17_11
 ; CHECK-NOV-NEXT:  .LBB17_2: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a2, fs4, rtz
-; CHECK-NOV-NEXT:    bge a1, a3, .LBB17_12
+; CHECK-NOV-NEXT:    bge a1, a4, .LBB17_12
 ; CHECK-NOV-NEXT:  .LBB17_3: # %entry
-; CHECK-NOV-NEXT:    fcvt.l.s a4, fs3, rtz
-; CHECK-NOV-NEXT:    bge a2, a3, .LBB17_13
+; CHECK-NOV-NEXT:    fcvt.l.s a3, fs3, rtz
+; CHECK-NOV-NEXT:    bge a2, a4, .LBB17_13
 ; CHECK-NOV-NEXT:  .LBB17_4: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a5, fs2, rtz
-; CHECK-NOV-NEXT:    bge a4, a3, .LBB17_14
+; CHECK-NOV-NEXT:    bge a3, a4, .LBB17_14
 ; CHECK-NOV-NEXT:  .LBB17_5: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a6, fs1, rtz
-; CHECK-NOV-NEXT:    bge a5, a3, .LBB17_15
+; CHECK-NOV-NEXT:    bge a5, a4, .LBB17_15
 ; CHECK-NOV-NEXT:  .LBB17_6: # %entry
 ; CHECK-NOV-NEXT:    fcvt.l.s a7, fs0, rtz
-; CHECK-NOV-NEXT:    bge a6, a3, .LBB17_16
+; CHECK-NOV-NEXT:    bge a6, a4, .LBB17_16
 ; CHECK-NOV-NEXT:  .LBB17_7: # %entry
-; CHECK-NOV-NEXT:    blt a7, a3, .LBB17_9
+; CHECK-NOV-NEXT:    blt a7, a4, .LBB17_9
 ; CHECK-NOV-NEXT:  .LBB17_8: # %entry
-; CHECK-NOV-NEXT:    mv a7, a3
+; CHECK-NOV-NEXT:    mv a7, a4
 ; CHECK-NOV-NEXT:  .LBB17_9: # %entry
-; CHECK-NOV-NEXT:    sgtz a3, a0
+; CHECK-NOV-NEXT:    sgtz a4, a0
 ; CHECK-NOV-NEXT:    sgtz t0, s1
 ; CHECK-NOV-NEXT:    sgtz t1, a1
 ; CHECK-NOV-NEXT:    sgtz t2, a2
-; CHECK-NOV-NEXT:    sgtz t3, a4
+; CHECK-NOV-NEXT:    sgtz t3, a3
 ; CHECK-NOV-NEXT:    sgtz t4, a5
 ; CHECK-NOV-NEXT:    sgtz t5, a6
 ; CHECK-NOV-NEXT:    sgtz t6, a7
 ; CHECK-NOV-NEXT:    negw t6, t6
-; CHECK-NOV-NEXT:    and a7, t6, a7
 ; CHECK-NOV-NEXT:    negw t5, t5
-; CHECK-NOV-NEXT:    and a6, t5, a6
 ; CHECK-NOV-NEXT:    negw t4, t4
-; CHECK-NOV-NEXT:    and a5, t4, a5
 ; CHECK-NOV-NEXT:    negw t3, t3
-; CHECK-NOV-NEXT:    and a4, t3, a4
 ; CHECK-NOV-NEXT:    negw t2, t2
-; CHECK-NOV-NEXT:    and a2, t2, a2
 ; CHECK-NOV-NEXT:    negw t1, t1
-; CHECK-NOV-NEXT:    and a1, t1, a1
 ; CHECK-NOV-NEXT:    negw t0, t0
+; CHECK-NOV-NEXT:    negw a4, a4
+; CHECK-NOV-NEXT:    and a7, t6, a7
+; CHECK-NOV-NEXT:    and a6, t5, a6
+; CHECK-NOV-NEXT:    and a5, t4, a5
+; CHECK-NOV-NEXT:    and a3, t3, a3
+; CHECK-NOV-NEXT:    and a2, t2, a2
+; CHECK-NOV-NEXT:    and a1, t1, a1
 ; CHECK-NOV-NEXT:    and t0, t0, s1
-; CHECK-NOV-NEXT:    negw a3, a3
-; CHECK-NOV-NEXT:    and a0, a3, a0
+; CHECK-NOV-NEXT:    and a0, a4, a0
 ; CHECK-NOV-NEXT:    sh a2, 8(s0)
 ; CHECK-NOV-NEXT:    sh a1, 10(s0)
 ; CHECK-NOV-NEXT:    sh t0, 12(s0)
@@ -2063,7 +2063,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    sh a7, 0(s0)
 ; CHECK-NOV-NEXT:    sh a6, 2(s0)
 ; CHECK-NOV-NEXT:    sh a5, 4(s0)
-; CHECK-NOV-NEXT:    sh a4, 6(s0)
+; CHECK-NOV-NEXT:    sh a3, 6(s0)
 ; CHECK-NOV-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 104(sp) # 8-byte Folded Reload
@@ -2101,32 +2101,32 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NOV-NEXT:    ret
 ; CHECK-NOV-NEXT:  .LBB17_10: # %entry
 ; CHECK-NOV-NEXT:    .cfi_restore_state
-; CHECK-NOV-NEXT:    mv a0, a3
+; CHECK-NOV-NEXT:    mv a0, a4
 ; CHECK-NOV-NEXT:    fcvt.l.s a1, fs5, rtz
-; CHECK-NOV-NEXT:    blt s1, a3, .LBB17_2
+; CHECK-NOV-NEXT:    blt s1, a4, .LBB17_2
 ; CHECK-NOV-NEXT:  .LBB17_11: # %entry
-; CHECK-NOV-NEXT:    mv s1, a3
+; CHECK-NOV-NEXT:    mv s1, a4
 ; CHECK-NOV-NEXT:    fcvt.l.s a2, fs4, rtz
-; CHECK-NOV-NEXT:    blt a1, a3, .LBB17_3
+; CHECK-NOV-NEXT:    blt a1, a4, .LBB17_3
 ; CHECK-NOV-NEXT:  .LBB17_12: # %entry
-; CHECK-NOV-NEXT:    mv a1, a3
-; CHECK-NOV-NEXT:    fcvt.l.s a4, fs3, rtz
-; CHECK-NOV-NEXT:    blt a2, a3, .LBB17_4
+; CHECK-NOV-NEXT:    mv a1, a4
+; CHECK-NOV-NEXT:    fcvt.l.s a3, fs3, rtz
+; CHECK-NOV-NEXT:    blt a2, a4, .LBB17_4
 ; CHECK-NOV-NEXT:  .LBB17_13: # %entry
-; CHECK-NOV-NEXT:    mv a2, a3
+; CHECK-NOV-NEXT:    mv a2, a4
 ; CHECK-NOV-NEXT:    fcvt.l.s a5, fs2, rtz
-; CHECK-NOV-NEXT:    blt a4, a3, .LBB17_5
+; CHECK-NOV-NEXT:    blt a3, a4, .LBB17_5
 ; CHECK-NOV-NEXT:  .LBB17_14: # %entry
-; CHECK-NOV-NEXT:    mv a4, a3
+; CHECK-NOV-NEXT:    mv a3, a4
 ; CHECK-NOV-NEXT:    fcvt.l.s a6, fs1, rtz
-; CHECK-NOV-NEXT:    blt a5, a3, .LBB17_6
+; CHECK-NOV-NEXT:    blt a5, a4, .LBB17_6
 ; CHECK-NOV-NEXT:  .LBB17_15: # %entry
-; CHECK-NOV-NEXT:    mv a5, a3
+; CHECK-NOV-NEXT:    mv a5, a4
 ; CHECK-NOV-NEXT:    fcvt.l.s a7, fs0, rtz
-; CHECK-NOV-NEXT:    blt a6, a3, .LBB17_7
+; CHECK-NOV-NEXT:    blt a6, a4, .LBB17_7
 ; CHECK-NOV-NEXT:  .LBB17_16: # %entry
-; CHECK-NOV-NEXT:    mv a6, a3
-; CHECK-NOV-NEXT:    bge a7, a3, .LBB17_8
+; CHECK-NOV-NEXT:    mv a6, a4
+; CHECK-NOV-NEXT:    bge a7, a4, .LBB17_8
 ; CHECK-NOV-NEXT:    j .LBB17_9
 ;
 ; CHECK-V-LABEL: ustest_f16i16:
@@ -2510,8 +2510,8 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    snez a1, a1
 ; CHECK-NOV-NEXT:    snez a2, s1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
+; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    and a1, a1, a0
 ; CHECK-NOV-NEXT:    mv a0, a2
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -2552,8 +2552,8 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:    snez a1, a1
 ; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    addi a2, a2, -1
-; CHECK-V-NEXT:    and a2, a2, s0
 ; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    and a2, a2, s0
 ; CHECK-V-NEXT:    and a0, a1, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
@@ -2614,23 +2614,23 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    and a3, a3, a0
 ; CHECK-NOV-NEXT:    beqz a1, .LBB20_7
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
-; CHECK-NOV-NEXT:    sgtz a1, a1
-; CHECK-NOV-NEXT:    and a4, a4, s0
+; CHECK-NOV-NEXT:    sgtz a0, a1
+; CHECK-NOV-NEXT:    and a1, a4, s0
 ; CHECK-NOV-NEXT:    bnez a2, .LBB20_8
 ; CHECK-NOV-NEXT:  .LBB20_6:
-; CHECK-NOV-NEXT:    snez a0, a4
+; CHECK-NOV-NEXT:    snez a2, a1
 ; CHECK-NOV-NEXT:    j .LBB20_9
 ; CHECK-NOV-NEXT:  .LBB20_7:
-; CHECK-NOV-NEXT:    snez a1, a3
-; CHECK-NOV-NEXT:    and a4, a4, s0
+; CHECK-NOV-NEXT:    snez a0, a3
+; CHECK-NOV-NEXT:    and a1, a4, s0
 ; CHECK-NOV-NEXT:    beqz a2, .LBB20_6
 ; CHECK-NOV-NEXT:  .LBB20_8: # %entry
-; CHECK-NOV-NEXT:    sgtz a0, a2
+; CHECK-NOV-NEXT:    sgtz a2, a2
 ; CHECK-NOV-NEXT:  .LBB20_9: # %entry
-; CHECK-NOV-NEXT:    neg a0, a0
-; CHECK-NOV-NEXT:    and a0, a0, a4
-; CHECK-NOV-NEXT:    neg a1, a1
-; CHECK-NOV-NEXT:    and a1, a1, a3
+; CHECK-NOV-NEXT:    neg a2, a2
+; CHECK-NOV-NEXT:    neg a4, a0
+; CHECK-NOV-NEXT:    and a0, a2, a1
+; CHECK-NOV-NEXT:    and a1, a4, a3
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -2696,8 +2696,8 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-V-NEXT:    sgtz a2, a2
 ; CHECK-V-NEXT:  .LBB20_9: # %entry
 ; CHECK-V-NEXT:    neg a2, a2
-; CHECK-V-NEXT:    and a2, a2, a3
 ; CHECK-V-NEXT:    neg a1, a1
+; CHECK-V-NEXT:    and a2, a2, a3
 ; CHECK-V-NEXT:    and a0, a1, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
@@ -2932,8 +2932,8 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    snez a1, a1
 ; CHECK-NOV-NEXT:    snez a2, s1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
+; CHECK-NOV-NEXT:    and a2, a2, s0
 ; CHECK-NOV-NEXT:    and a1, a1, a0
 ; CHECK-NOV-NEXT:    mv a0, a2
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -2974,8 +2974,8 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    snez a1, a1
 ; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    addi a2, a2, -1
-; CHECK-V-NEXT:    and a2, a2, s0
 ; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    and a2, a2, s0
 ; CHECK-V-NEXT:    and a0, a1, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
@@ -3036,23 +3036,23 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    and a3, a3, a0
 ; CHECK-NOV-NEXT:    beqz a1, .LBB23_7
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
-; CHECK-NOV-NEXT:    sgtz a1, a1
-; CHECK-NOV-NEXT:    and a4, a4, s0
+; CHECK-NOV-NEXT:    sgtz a0, a1
+; CHECK-NOV-NEXT:    and a1, a4, s0
 ; CHECK-NOV-NEXT:    bnez a2, .LBB23_8
 ; CHECK-NOV-NEXT:  .LBB23_6:
-; CHECK-NOV-NEXT:    snez a0, a4
+; CHECK-NOV-NEXT:    snez a2, a1
 ; CHECK-NOV-NEXT:    j .LBB23_9
 ; CHECK-NOV-NEXT:  .LBB23_7:
-; CHECK-NOV-NEXT:    snez a1, a3
-; CHECK-NOV-NEXT:    and a4, a4, s0
+; CHECK-NOV-NEXT:    snez a0, a3
+; CHECK-NOV-NEXT:    and a1, a4, s0
 ; CHECK-NOV-NEXT:    beqz a2, .LBB23_6
 ; CHECK-NOV-NEXT:  .LBB23_8: # %entry
-; CHECK-NOV-NEXT:    sgtz a0, a2
+; CHECK-NOV-NEXT:    sgtz a2, a2
 ; CHECK-NOV-NEXT:  .LBB23_9: # %entry
-; CHECK-NOV-NEXT:    neg a0, a0
-; CHECK-NOV-NEXT:    and a0, a0, a4
-; CHECK-NOV-NEXT:    neg a1, a1
-; CHECK-NOV-NEXT:    and a1, a1, a3
+; CHECK-NOV-NEXT:    neg a2, a2
+; CHECK-NOV-NEXT:    neg a4, a0
+; CHECK-NOV-NEXT:    and a0, a2, a1
+; CHECK-NOV-NEXT:    and a1, a4, a3
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3118,8 +3118,8 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) {
 ; CHECK-V-NEXT:    sgtz a2, a2
 ; CHECK-V-NEXT:  .LBB23_9: # %entry
 ; CHECK-V-NEXT:    neg a2, a2
-; CHECK-V-NEXT:    and a2, a2, a3
 ; CHECK-V-NEXT:    neg a1, a1
+; CHECK-V-NEXT:    and a2, a2, a3
 ; CHECK-V-NEXT:    and a0, a1, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
@@ -3357,8 +3357,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    snez a1, a1
 ; CHECK-NOV-NEXT:    snez a2, s2
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
-; CHECK-NOV-NEXT:    and a2, a2, s1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
+; CHECK-NOV-NEXT:    and a2, a2, s1
 ; CHECK-NOV-NEXT:    and a1, a1, a0
 ; CHECK-NOV-NEXT:    mv a0, a2
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -3397,8 +3397,8 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    snez a1, a1
 ; CHECK-V-NEXT:    snez a2, s2
 ; CHECK-V-NEXT:    addi a2, a2, -1
-; CHECK-V-NEXT:    and a2, a2, s1
 ; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    and a2, a2, s1
 ; CHECK-V-NEXT:    and a0, a1, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v9, a0
@@ -3461,23 +3461,23 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    and a3, a3, a0
 ; CHECK-NOV-NEXT:    beqz a1, .LBB26_7
 ; CHECK-NOV-NEXT:  # %bb.5: # %entry
-; CHECK-NOV-NEXT:    sgtz a1, a1
-; CHECK-NOV-NEXT:    and a4, a4, s0
+; CHECK-NOV-NEXT:    sgtz a0, a1
+; CHECK-NOV-NEXT:    and a1, a4, s0
 ; CHECK-NOV-NEXT:    bnez a2, .LBB26_8
 ; CHECK-NOV-NEXT:  .LBB26_6:
-; CHECK-NOV-NEXT:    snez a0, a4
+; CHECK-NOV-NEXT:    snez a2, a1
 ; CHECK-NOV-NEXT:    j .LBB26_9
 ; CHECK-NOV-NEXT:  .LBB26_7:
-; CHECK-NOV-NEXT:    snez a1, a3
-; CHECK-NOV-NEXT:    and a4, a4, s0
+; CHECK-NOV-NEXT:    snez a0, a3
+; CHECK-NOV-NEXT:    and a1, a4, s0
 ; CHECK-NOV-NEXT:    beqz a2, .LBB26_6
 ; CHECK-NOV-NEXT:  .LBB26_8: # %entry
-; CHECK-NOV-NEXT:    sgtz a0, a2
+; CHECK-NOV-NEXT:    sgtz a2, a2
 ; CHECK-NOV-NEXT:  .LBB26_9: # %entry
-; CHECK-NOV-NEXT:    neg a0, a0
-; CHECK-NOV-NEXT:    and a0, a0, a4
-; CHECK-NOV-NEXT:    neg a1, a1
-; CHECK-NOV-NEXT:    and a1, a1, a3
+; CHECK-NOV-NEXT:    neg a2, a2
+; CHECK-NOV-NEXT:    neg a4, a0
+; CHECK-NOV-NEXT:    and a0, a2, a1
+; CHECK-NOV-NEXT:    and a1, a4, a3
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -3541,8 +3541,8 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) {
 ; CHECK-V-NEXT:    sgtz a2, a2
 ; CHECK-V-NEXT:  .LBB26_9: # %entry
 ; CHECK-V-NEXT:    neg a2, a2
-; CHECK-V-NEXT:    and a2, a2, a3
 ; CHECK-V-NEXT:    neg a1, a1
+; CHECK-V-NEXT:    and a2, a2, a3
 ; CHECK-V-NEXT:    and a0, a1, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v9, a0
@@ -6050,11 +6050,11 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    fmv.d fa0, fs0
 ; CHECK-NOV-NEXT:    call __fixunsdfti
 ; CHECK-NOV-NEXT:    snez a1, a1
+; CHECK-NOV-NEXT:    snez a2, s1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
+; CHECK-NOV-NEXT:    addi a2, a2, -1
 ; CHECK-NOV-NEXT:    and a0, a1, a0
-; CHECK-NOV-NEXT:    snez a1, s1
-; CHECK-NOV-NEXT:    addi a1, a1, -1
-; CHECK-NOV-NEXT:    and a1, a1, s0
+; CHECK-NOV-NEXT:    and a1, a2, s0
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -6094,13 +6094,13 @@ define <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixunsdfti
 ; CHECK-V-NEXT:    snez a1, a1
+; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    addi a2, a2, -1
 ; CHECK-V-NEXT:    and a0, a1, a0
-; CHECK-V-NEXT:    snez a1, s1
-; CHECK-V-NEXT:    addi a1, a1, -1
-; CHECK-V-NEXT:    and a1, a1, s0
+; CHECK-V-NEXT:    and a2, a2, s0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a1
+; CHECK-V-NEXT:    vmv.s.x v8, a2
 ; CHECK-V-NEXT:    vmv.s.x v9, a0
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -6152,16 +6152,16 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NOV-NEXT:    li a3, 1
 ; CHECK-NOV-NEXT:  .LBB47_4: # %entry
 ; CHECK-NOV-NEXT:    slti a1, a1, 1
-; CHECK-NOV-NEXT:    neg a1, a1
-; CHECK-NOV-NEXT:    and a1, a1, a0
-; CHECK-NOV-NEXT:    slti a0, s1, 1
-; CHECK-NOV-NEXT:    neg a0, a0
-; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    slti a4, s1, 1
 ; CHECK-NOV-NEXT:    slti a3, a3, 0
-; CHECK-NOV-NEXT:    addi a3, a3, -1
-; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    slti a2, a2, 0
+; CHECK-NOV-NEXT:    neg a1, a1
+; CHECK-NOV-NEXT:    neg a4, a4
+; CHECK-NOV-NEXT:    addi a3, a3, -1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
+; CHECK-NOV-NEXT:    and a1, a1, a0
+; CHECK-NOV-NEXT:    and a0, a4, s0
+; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -6209,20 +6209,20 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-V-NEXT:    li a3, 1
 ; CHECK-V-NEXT:  .LBB47_4: # %entry
 ; CHECK-V-NEXT:    slti a1, a1, 1
-; CHECK-V-NEXT:    neg a1, a1
-; CHECK-V-NEXT:    and a0, a1, a0
-; CHECK-V-NEXT:    slti a1, s1, 1
-; CHECK-V-NEXT:    neg a1, a1
-; CHECK-V-NEXT:    and a1, a1, s0
+; CHECK-V-NEXT:    slti a4, s1, 1
 ; CHECK-V-NEXT:    slti a3, a3, 0
-; CHECK-V-NEXT:    addi a3, a3, -1
-; CHECK-V-NEXT:    and a1, a3, a1
 ; CHECK-V-NEXT:    slti a2, a2, 0
+; CHECK-V-NEXT:    neg a1, a1
+; CHECK-V-NEXT:    neg a4, a4
+; CHECK-V-NEXT:    addi a3, a3, -1
 ; CHECK-V-NEXT:    addi a2, a2, -1
+; CHECK-V-NEXT:    and a0, a1, a0
+; CHECK-V-NEXT:    and a4, a4, s0
+; CHECK-V-NEXT:    and a3, a3, a4
 ; CHECK-V-NEXT:    and a0, a2, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    vmv.s.x v9, a1
+; CHECK-V-NEXT:    vmv.s.x v9, a3
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    add sp, sp, a0
@@ -6454,11 +6454,11 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    fmv.s fa0, fs0
 ; CHECK-NOV-NEXT:    call __fixunssfti
 ; CHECK-NOV-NEXT:    snez a1, a1
+; CHECK-NOV-NEXT:    snez a2, s1
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
+; CHECK-NOV-NEXT:    addi a2, a2, -1
 ; CHECK-NOV-NEXT:    and a0, a1, a0
-; CHECK-NOV-NEXT:    snez a1, s1
-; CHECK-NOV-NEXT:    addi a1, a1, -1
-; CHECK-NOV-NEXT:    and a1, a1, s0
+; CHECK-NOV-NEXT:    and a1, a2, s0
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -6498,13 +6498,13 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    vfmv.f.s fa0, v8
 ; CHECK-V-NEXT:    call __fixunssfti
 ; CHECK-V-NEXT:    snez a1, a1
+; CHECK-V-NEXT:    snez a2, s1
 ; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    addi a2, a2, -1
 ; CHECK-V-NEXT:    and a0, a1, a0
-; CHECK-V-NEXT:    snez a1, s1
-; CHECK-V-NEXT:    addi a1, a1, -1
-; CHECK-V-NEXT:    and a1, a1, s0
+; CHECK-V-NEXT:    and a2, a2, s0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v8, a1
+; CHECK-V-NEXT:    vmv.s.x v8, a2
 ; CHECK-V-NEXT:    vmv.s.x v9, a0
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
@@ -6556,16 +6556,16 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-NOV-NEXT:    li a3, 1
 ; CHECK-NOV-NEXT:  .LBB50_4: # %entry
 ; CHECK-NOV-NEXT:    slti a1, a1, 1
-; CHECK-NOV-NEXT:    neg a1, a1
-; CHECK-NOV-NEXT:    and a1, a1, a0
-; CHECK-NOV-NEXT:    slti a0, s1, 1
-; CHECK-NOV-NEXT:    neg a0, a0
-; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    slti a4, s1, 1
 ; CHECK-NOV-NEXT:    slti a3, a3, 0
-; CHECK-NOV-NEXT:    addi a3, a3, -1
-; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    slti a2, a2, 0
+; CHECK-NOV-NEXT:    neg a1, a1
+; CHECK-NOV-NEXT:    neg a4, a4
+; CHECK-NOV-NEXT:    addi a3, a3, -1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
+; CHECK-NOV-NEXT:    and a1, a1, a0
+; CHECK-NOV-NEXT:    and a0, a4, s0
+; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -6613,20 +6613,20 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
 ; CHECK-V-NEXT:    li a3, 1
 ; CHECK-V-NEXT:  .LBB50_4: # %entry
 ; CHECK-V-NEXT:    slti a1, a1, 1
-; CHECK-V-NEXT:    neg a1, a1
-; CHECK-V-NEXT:    and a0, a1, a0
-; CHECK-V-NEXT:    slti a1, s1, 1
-; CHECK-V-NEXT:    neg a1, a1
-; CHECK-V-NEXT:    and a1, a1, s0
+; CHECK-V-NEXT:    slti a4, s1, 1
 ; CHECK-V-NEXT:    slti a3, a3, 0
-; CHECK-V-NEXT:    addi a3, a3, -1
-; CHECK-V-NEXT:    and a1, a3, a1
 ; CHECK-V-NEXT:    slti a2, a2, 0
+; CHECK-V-NEXT:    neg a1, a1
+; CHECK-V-NEXT:    neg a4, a4
+; CHECK-V-NEXT:    addi a3, a3, -1
 ; CHECK-V-NEXT:    addi a2, a2, -1
+; CHECK-V-NEXT:    and a0, a1, a0
+; CHECK-V-NEXT:    and a4, a4, s0
+; CHECK-V-NEXT:    and a3, a3, a4
 ; CHECK-V-NEXT:    and a0, a2, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
-; CHECK-V-NEXT:    vmv.s.x v9, a1
+; CHECK-V-NEXT:    vmv.s.x v9, a3
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    add sp, sp, a0
@@ -6860,11 +6860,11 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    call __extendhfsf2
 ; CHECK-NOV-NEXT:    call __fixunssfti
 ; CHECK-NOV-NEXT:    snez a1, a1
+; CHECK-NOV-NEXT:    snez a2, s2
 ; CHECK-NOV-NEXT:    addi a1, a1, -1
+; CHECK-NOV-NEXT:    addi a2, a2, -1
 ; CHECK-NOV-NEXT:    and a0, a1, a0
-; CHECK-NOV-NEXT:    snez a1, s2
-; CHECK-NOV-NEXT:    addi a1, a1, -1
-; CHECK-NOV-NEXT:    and a1, a1, s1
+; CHECK-NOV-NEXT:    and a1, a2, s1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s1, 8(sp) # 8-byte Folded Reload
@@ -6899,13 +6899,13 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:    call __extendhfsf2
 ; CHECK-V-NEXT:    call __fixunssfti
 ; CHECK-V-NEXT:    snez a1, a1
+; CHECK-V-NEXT:    snez a2, s2
 ; CHECK-V-NEXT:    addi a1, a1, -1
+; CHECK-V-NEXT:    addi a2, a2, -1
 ; CHECK-V-NEXT:    and a0, a1, a0
-; CHECK-V-NEXT:    snez a1, s2
-; CHECK-V-NEXT:    addi a1, a1, -1
-; CHECK-V-NEXT:    and a1, a1, s1
+; CHECK-V-NEXT:    and a2, a2, s1
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-V-NEXT:    vmv.s.x v9, a1
+; CHECK-V-NEXT:    vmv.s.x v9, a2
 ; CHECK-V-NEXT:    vmv.s.x v8, a0
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -6959,16 +6959,16 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-NOV-NEXT:    li a3, 1
 ; CHECK-NOV-NEXT:  .LBB53_4: # %entry
 ; CHECK-NOV-NEXT:    slti a1, a1, 1
-; CHECK-NOV-NEXT:    neg a1, a1
-; CHECK-NOV-NEXT:    and a1, a1, a0
-; CHECK-NOV-NEXT:    slti a0, s1, 1
-; CHECK-NOV-NEXT:    neg a0, a0
-; CHECK-NOV-NEXT:    and a0, a0, s0
+; CHECK-NOV-NEXT:    slti a4, s1, 1
 ; CHECK-NOV-NEXT:    slti a3, a3, 0
-; CHECK-NOV-NEXT:    addi a3, a3, -1
-; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    slti a2, a2, 0
+; CHECK-NOV-NEXT:    neg a1, a1
+; CHECK-NOV-NEXT:    neg a4, a4
+; CHECK-NOV-NEXT:    addi a3, a3, -1
 ; CHECK-NOV-NEXT:    addi a2, a2, -1
+; CHECK-NOV-NEXT:    and a1, a1, a0
+; CHECK-NOV-NEXT:    and a0, a4, s0
+; CHECK-NOV-NEXT:    and a0, a3, a0
 ; CHECK-NOV-NEXT:    and a1, a2, a1
 ; CHECK-NOV-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-NOV-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
@@ -7014,20 +7014,20 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
 ; CHECK-V-NEXT:    li a3, 1
 ; CHECK-V-NEXT:  .LBB53_4: # %entry
 ; CHECK-V-NEXT:    slti a1, a1, 1
-; CHECK-V-NEXT:    neg a1, a1
-; CHECK-V-NEXT:    and a0, a1, a0
-; CHECK-V-NEXT:    slti a1, s1, 1
-; CHECK-V-NEXT:    neg a1, a1
-; CHECK-V-NEXT:    and a1, a1, s0
+; CHECK-V-NEXT:    slti a4, s1, 1
 ; CHECK-V-NEXT:    slti a3, a3, 0
-; CHECK-V-NEXT:    addi a3, a3, -1
-; CHECK-V-NEXT:    and a1, a3, a1
 ; CHECK-V-NEXT:    slti a2, a2, 0
+; CHECK-V-NEXT:    neg a1, a1
+; CHECK-V-NEXT:    neg a4, a4
+; CHECK-V-NEXT:    addi a3, a3, -1
 ; CHECK-V-NEXT:    addi a2, a2, -1
+; CHECK-V-NEXT:    and a0, a1, a0
+; CHECK-V-NEXT:    and a4, a4, s0
+; CHECK-V-NEXT:    and a3, a3, a4
 ; CHECK-V-NEXT:    and a0, a2, a0
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vmv.s.x v9, a0
-; CHECK-V-NEXT:    vmv.s.x v8, a1
+; CHECK-V-NEXT:    vmv.s.x v8, a3
 ; CHECK-V-NEXT:    vslideup.vi v8, v9, 1
 ; CHECK-V-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; CHECK-V-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
index 3fd37384ada9b2..5fe59f3b3933d0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll
@@ -17,9 +17,9 @@ define <vscale x 1 x bfloat> @rint_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -38,9 +38,9 @@ define <vscale x 2 x bfloat> @rint_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -59,9 +59,9 @@ define <vscale x 4 x bfloat> @rint_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -80,9 +80,9 @@ define <vscale x 8 x bfloat> @rint_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -101,9 +101,9 @@ define <vscale x 16 x bfloat> @rint_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -122,9 +122,9 @@ define <vscale x 32 x bfloat> @rint_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
@@ -168,9 +168,9 @@ define <vscale x 1 x half> @rint_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -203,9 +203,9 @@ define <vscale x 2 x half> @rint_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -238,9 +238,9 @@ define <vscale x 4 x half> @rint_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -273,9 +273,9 @@ define <vscale x 8 x half> @rint_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -308,9 +308,9 @@ define <vscale x 16 x half> @rint_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -343,9 +343,9 @@ define <vscale x 32 x half> @rint_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
index 54f56eadf00342..195ffc50594c34 100644
--- a/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/frm-insert.ll
@@ -459,12 +459,12 @@ define <vscale x 1 x float> @test5(<vscale x 1 x float> %0, <vscale x 1 x float>
 ; CHECK-NEXT:    fsrmi a2, 0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
+; CHECK-NEXT:    lui a0, 66
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    frrm a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    lui a2, 66
-; CHECK-NEXT:    addiw a2, a2, 769
-; CHECK-NEXT:    srl a0, a2, a0
+; CHECK-NEXT:    addiw a0, a0, 769
+; CHECK-NEXT:    frrm a2
+; CHECK-NEXT:    slli a2, a2, 2
+; CHECK-NEXT:    srl a0, a0, a2
 ; CHECK-NEXT:    andi a0, a0, 7
 ; CHECK-NEXT:    vfadd.vv v8, v8, v8
 ; CHECK-NEXT:    sw a0, 0(a1)
@@ -475,12 +475,12 @@ define <vscale x 1 x float> @test5(<vscale x 1 x float> %0, <vscale x 1 x float>
 ; UNOPT-NEXT:    fsrmi a2, 0
 ; UNOPT-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v9
+; UNOPT-NEXT:    lui a0, 66
 ; UNOPT-NEXT:    fsrm a2
-; UNOPT-NEXT:    frrm a0
-; UNOPT-NEXT:    slli a0, a0, 2
-; UNOPT-NEXT:    lui a2, 66
-; UNOPT-NEXT:    addiw a2, a2, 769
-; UNOPT-NEXT:    srl a0, a2, a0
+; UNOPT-NEXT:    addiw a0, a0, 769
+; UNOPT-NEXT:    frrm a2
+; UNOPT-NEXT:    slli a2, a2, 2
+; UNOPT-NEXT:    srl a0, a0, a2
 ; UNOPT-NEXT:    andi a0, a0, 7
 ; UNOPT-NEXT:    vfadd.vv v8, v8, v8
 ; UNOPT-NEXT:    sw a0, 0(a1)
@@ -588,8 +588,8 @@ define <vscale x 1 x float> @after_fsrm4(<vscale x 1 x float> %0, <vscale x 1 x
 ; CHECK-LABEL: after_fsrm4:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    slli a0, a0, 32
-; CHECK-NEXT:    srli a0, a0, 30
 ; CHECK-NEXT:    lui a2, 66
+; CHECK-NEXT:    srli a0, a0, 30
 ; CHECK-NEXT:    addiw a2, a2, 769
 ; CHECK-NEXT:    srl a0, a2, a0
 ; CHECK-NEXT:    andi a0, a0, 7
@@ -601,8 +601,8 @@ define <vscale x 1 x float> @after_fsrm4(<vscale x 1 x float> %0, <vscale x 1 x
 ; UNOPT-LABEL: after_fsrm4:
 ; UNOPT:       # %bb.0: # %entry
 ; UNOPT-NEXT:    slli a0, a0, 32
-; UNOPT-NEXT:    srli a0, a0, 30
 ; UNOPT-NEXT:    lui a2, 66
+; UNOPT-NEXT:    srli a0, a0, 30
 ; UNOPT-NEXT:    addiw a2, a2, 769
 ; UNOPT-NEXT:    srl a0, a2, a0
 ; UNOPT-NEXT:    andi a0, a0, 7

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
index 095f44cfb63e40..3d992aa13e379a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
@@ -149,10 +149,10 @@ define <vscale x 1 x float> @round_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -172,10 +172,10 @@ define <vscale x 2 x float> @round_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -195,10 +195,10 @@ define <vscale x 4 x float> @round_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -218,10 +218,10 @@ define <vscale x 8 x float> @round_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -241,10 +241,10 @@ define <vscale x 16 x float> @round_nxv16f32(<vscale x 16 x float> %x) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
index fd834e9eb5275c..f7422b279149f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll
@@ -19,9 +19,9 @@ define <vscale x 1 x bfloat> @round_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -42,9 +42,9 @@ define <vscale x 2 x bfloat> @round_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -65,9 +65,9 @@ define <vscale x 4 x bfloat> @round_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -88,9 +88,9 @@ define <vscale x 8 x bfloat> @round_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -111,9 +111,9 @@ define <vscale x 16 x bfloat> @round_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -134,9 +134,9 @@ define <vscale x 32 x bfloat> @round_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -146,19 +146,21 @@ define <vscale x 32 x bfloat> @round_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
+; CHECK-NEXT:    vfabs.v v8, v24
+; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    ret
   %a = call <vscale x 32 x bfloat> @llvm.round.nxv32bf16(<vscale x 32 x bfloat> %x)
   ret <vscale x 32 x bfloat> %a
@@ -184,9 +186,9 @@ define <vscale x 1 x half> @round_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -223,9 +225,9 @@ define <vscale x 2 x half> @round_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -262,9 +264,9 @@ define <vscale x 4 x half> @round_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -301,9 +303,9 @@ define <vscale x 8 x half> @round_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -340,9 +342,9 @@ define <vscale x 16 x half> @round_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -379,9 +381,9 @@ define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -391,19 +393,21 @@ define <vscale x 32 x half> @round_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT:    vfabs.v v8, v24
+; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 32 x half> @llvm.round.nxv32f16(<vscale x 32 x half> %x)
   ret <vscale x 32 x half> %a

diff  --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
index 051939d988f857..c293ac91b63bff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll
@@ -149,10 +149,10 @@ define <vscale x 1 x float> @roundeven_nxv1f32(<vscale x 1 x float> %x) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -172,10 +172,10 @@ define <vscale x 2 x float> @roundeven_nxv2f32(<vscale x 2 x float> %x) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -195,10 +195,10 @@ define <vscale x 4 x float> @roundeven_nxv4f32(<vscale x 4 x float> %x) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
@@ -218,10 +218,10 @@ define <vscale x 8 x float> @roundeven_nxv8f32(<vscale x 8 x float> %x) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
@@ -241,10 +241,10 @@ define <vscale x 16 x float> @roundeven_nxv16f32(<vscale x 16 x float> %x) stric
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
index 85146588246780..865531b77eb29d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll
@@ -18,9 +18,9 @@ define <vscale x 1 x bfloat> @roundeven_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -41,9 +41,9 @@ define <vscale x 2 x bfloat> @roundeven_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -64,9 +64,9 @@ define <vscale x 4 x bfloat> @roundeven_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -87,9 +87,9 @@ define <vscale x 8 x bfloat> @roundeven_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -110,9 +110,9 @@ define <vscale x 16 x bfloat> @roundeven_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -133,9 +133,9 @@ define <vscale x 32 x bfloat> @roundeven_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -145,19 +145,21 @@ define <vscale x 32 x bfloat> @roundeven_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    vmflt.vf v0, v24, fa5
+; CHECK-NEXT:    vfabs.v v8, v24
+; CHECK-NEXT:    vmflt.vf v0, v8, fa5
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    ret
   %a = call <vscale x 32 x bfloat> @llvm.roundeven.nxv32bf16(<vscale x 32 x bfloat> %x)
   ret <vscale x 32 x bfloat> %a
@@ -183,9 +185,9 @@ define <vscale x 1 x half> @roundeven_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -222,9 +224,9 @@ define <vscale x 2 x half> @roundeven_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -261,9 +263,9 @@ define <vscale x 4 x half> @roundeven_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -300,9 +302,9 @@ define <vscale x 8 x half> @roundeven_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -339,9 +341,9 @@ define <vscale x 16 x half> @roundeven_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -378,9 +380,9 @@ define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -390,19 +392,21 @@ define <vscale x 32 x half> @roundeven_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
+; ZVFHMIN-NEXT:    vfabs.v v8, v24
+; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    ret
   %a = call <vscale x 32 x half> @llvm.roundeven.nxv32f16(<vscale x 32 x half> %x)
   ret <vscale x 32 x half> %a

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index cf35e9c40b8a73..c7e3c8cb519829 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -216,15 +216,16 @@ define <vscale x 64 x i8> @fshr_v64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8>
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vsll.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    vnot.v v8, v24, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 7, v0.t
 ; CHECK-NEXT:    vsll.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vand.vi v16, v24, 7, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsrl.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -249,15 +250,16 @@ define <vscale x 64 x i8> @fshl_v64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8>
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v16, 1, v0.t
 ; CHECK-NEXT:    vnot.v v8, v24, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 7, v0.t
 ; CHECK-NEXT:    vsrl.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vand.vi v16, v24, 7, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsll.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -452,15 +454,16 @@ define <vscale x 32 x i16> @fshr_v32i16(<vscale x 32 x i16> %a, <vscale x 32 x i
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vsll.vi v16, v8, 1, v0.t
 ; CHECK-NEXT:    vnot.v v8, v24, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vsll.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vand.vi v16, v24, 15, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsrl.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -485,15 +488,16 @@ define <vscale x 32 x i16> @fshl_v32i16(<vscale x 32 x i16> %a, <vscale x 32 x i
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re16.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; CHECK-NEXT:    vsrl.vi v16, v16, 1, v0.t
 ; CHECK-NEXT:    vnot.v v8, v24, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
 ; CHECK-NEXT:    vsrl.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vand.vi v16, v24, 15, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsll.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
@@ -662,9 +666,9 @@ define <vscale x 16 x i32> @fshr_v16i32(<vscale x 16 x i32> %a, <vscale x 16 x i
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
@@ -697,10 +701,10 @@ define <vscale x 16 x i32> @fshl_v16i32(<vscale x 16 x i32> %a, <vscale x 16 x i
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vl8re32.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    vl8re32.v v24, (a0)
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
@@ -841,9 +845,9 @@ define <vscale x 7 x i64> @fshr_v7i64(<vscale x 7 x i64> %a, <vscale x 7 x i64>
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
@@ -876,10 +880,10 @@ define <vscale x 7 x i64> @fshl_v7i64(<vscale x 7 x i64> %a, <vscale x 7 x i64>
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
@@ -912,9 +916,9 @@ define <vscale x 8 x i64> @fshr_v8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64>
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
@@ -947,10 +951,10 @@ define <vscale x 8 x i64> @fshl_v8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64>
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vl8re64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v24, a0, v0.t
@@ -998,45 +1002,61 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
-; CHECK-NEXT:    add a5, a0, a3
-; CHECK-NEXT:    add a3, a2, a3
+; CHECK-NEXT:    sub a5, a4, a1
+; CHECK-NEXT:    add a6, a2, a3
+; CHECK-NEXT:    vl8re64.v v8, (a6)
+; CHECK-NEXT:    csrr a6, vlenb
+; CHECK-NEXT:    li a7, 40
+; CHECK-NEXT:    mul a6, a6, a7
+; CHECK-NEXT:    add a6, sp, a6
+; CHECK-NEXT:    addi a6, a6, 16
+; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; CHECK-NEXT:    sltu a6, a4, a5
+; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    and a5, a6, a5
+; CHECK-NEXT:    srli a6, a1, 3
+; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v16, (a3)
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a3, a3, 4
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    sub a3, a4, a1
-; CHECK-NEXT:    sltu a6, a4, a3
-; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a6, a6, a3
-; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a6
+; CHECK-NEXT:    li a3, 63
+; CHECK-NEXT:    csrr a6, vlenb
 ; CHECK-NEXT:    li a7, 40
-; CHECK-NEXT:    mul a5, a5, a7
+; CHECK-NEXT:    mul a6, a6, a7
+; CHECK-NEXT:    add a6, sp, a6
+; CHECK-NEXT:    addi a6, a6, 16
+; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 3
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    li a3, 63
-; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v16, a3, v0.t
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 40
-; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    slli a5, a5, 4
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a5, vlenb
 ; CHECK-NEXT:    slli a5, a5, 3
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 4
+; CHECK-NEXT:    slli a5, a5, 3
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li a6, 40
+; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
@@ -1168,27 +1188,34 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a5, a3, 3
 ; CHECK-NEXT:    srli a1, a3, 3
-; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a6, a4, a3
+; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    add a1, a2, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    sub a1, a4, a3
-; CHECK-NEXT:    sltu a6, a4, a1
-; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a6, a6, a1
+; CHECK-NEXT:    sltu a1, a4, a6
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a6, a1, a6
 ; CHECK-NEXT:    li a1, 63
 ; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    csrr a6, vlenb
+; CHECK-NEXT:    slli a6, a6, 4
+; CHECK-NEXT:    add a6, sp, a6
+; CHECK-NEXT:    addi a6, a6, 16
+; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a6, vlenb
 ; CHECK-NEXT:    slli a6, a6, 5
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
 ; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a6, vlenb
+; CHECK-NEXT:    slli a6, a6, 4
+; CHECK-NEXT:    add a6, sp, a6
+; CHECK-NEXT:    addi a6, a6, 16
+; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsll.vv v16, v16, v8, v0.t
 ; CHECK-NEXT:    csrr a6, vlenb
 ; CHECK-NEXT:    slli a6, a6, 3
@@ -1196,10 +1223,7 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    addi a6, a6, 16
 ; CHECK-NEXT:    vs8r.v v16, (a6) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a5, a0, a5
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 4
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
+; CHECK-NEXT:    addi a6, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vl8re64.v v16, (a5)
@@ -1354,10 +1378,10 @@ define <vscale x 1 x i8> @fshr_v1i4(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
+; CHECK-NEXT:    li a0, 4
 ; CHECK-NEXT:    vand.vi v9, v9, 15, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    li a0, 4
 ; CHECK-NEXT:    vremu.vx v9, v10, a0, v0.t
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vand.vi v8, v8, 15, v0.t
@@ -1376,10 +1400,10 @@ define <vscale x 1 x i8> @fshl_v1i4(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v10, 15, v0.t
+; CHECK-NEXT:    li a0, 4
 ; CHECK-NEXT:    vand.vi v9, v9, 15, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
-; CHECK-NEXT:    li a0, 4
 ; CHECK-NEXT:    vremu.vx v9, v10, a0, v0.t
 ; CHECK-NEXT:    vsll.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    vsrl.vi v8, v8, 4, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll
index 8452848c467d37..eae21a76f3f00a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl.ll
@@ -7,12 +7,12 @@ define <vscale x 1 x i32> @fshr(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v11, v10, a0
-; CHECK-NEXT:    vsrl.vv v9, v9, v11
-; CHECK-NEXT:    vnot.v v10, v10
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vnot.v v11, v10
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
-; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vand.vx v11, v11, a0
+; CHECK-NEXT:    vsrl.vv v9, v9, v10
+; CHECK-NEXT:    vsll.vv v8, v8, v11
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %res = call <vscale x 1 x i32> @llvm.fshr.v4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)
@@ -24,12 +24,12 @@ define <vscale x 1 x i32> @fshl(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v11, v10, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v11
-; CHECK-NEXT:    vnot.v v10, v10
-; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vnot.v v11, v10
 ; CHECK-NEXT:    vsrl.vi v9, v9, 1
-; CHECK-NEXT:    vsrl.vv v9, v9, v10
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vand.vx v11, v11, a0
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vsrl.vv v9, v9, v11
 ; CHECK-NEXT:    vor.vv v8, v8, v9
 ; CHECK-NEXT:    ret
   %res = call <vscale x 1 x i32> @llvm.fshl.v4i32(<vscale x 1 x i32> %a, <vscale x 1 x i32> %b, <vscale x 1 x i32> %c)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
index d07bc2c6bf74d7..8a5f118d8f6acc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll
@@ -135,10 +135,10 @@ define <vscale x 1 x float> @trunc_nxv1f32(<vscale x 1 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -156,10 +156,10 @@ define <vscale x 2 x float> @trunc_nxv2f32(<vscale x 2 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v9, v8
 ; CHECK-NEXT:    vmflt.vf v0, v9, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v9, v8, v0.t
@@ -177,10 +177,10 @@ define <vscale x 4 x float> @trunc_nxv4f32(<vscale x 4 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v10, v8
 ; CHECK-NEXT:    vmflt.vf v0, v10, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v10, v8, v0.t
@@ -198,10 +198,10 @@ define <vscale x 8 x float> @trunc_nxv8f32(<vscale x 8 x float> %x) strictfp {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v12, v8
 ; CHECK-NEXT:    vmflt.vf v0, v12, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v12, v8, v0.t
@@ -219,10 +219,10 @@ define <vscale x 16 x float> @trunc_nxv16f32(<vscale x 16 x float> %x) strictfp
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmfne.vv v0, v8, v8
-; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
-; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    lui a0, 307200
+; CHECK-NEXT:    vfadd.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    vfabs.v v16, v8
 ; CHECK-NEXT:    vmflt.vf v0, v16, fa5
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v16, v8, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
index 2b3c952679eac6..d597e166be4eed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll
@@ -17,9 +17,9 @@ define <vscale x 1 x bfloat> @trunc_nxv1bf16(<vscale x 1 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
@@ -38,9 +38,9 @@ define <vscale x 2 x bfloat> @trunc_nxv2bf16(<vscale x 2 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
@@ -59,9 +59,9 @@ define <vscale x 4 x bfloat> @trunc_nxv4bf16(<vscale x 4 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
@@ -80,9 +80,9 @@ define <vscale x 8 x bfloat> @trunc_nxv8bf16(<vscale x 8 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v12, v0.t
@@ -101,9 +101,9 @@ define <vscale x 16 x bfloat> @trunc_nxv16bf16(<vscale x 16 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v8, v16, v0.t
@@ -122,9 +122,9 @@ define <vscale x 32 x bfloat> @trunc_nxv32bf16(<vscale x 32 x bfloat> %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
-; CHECK-NEXT:    lui a0, 307200
 ; CHECK-NEXT:    fmv.w.x fa5, a0
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
 ; CHECK-NEXT:    vfcvt.rtz.x.f.v v24, v16, v0.t
@@ -168,9 +168,9 @@ define <vscale x 1 x half> @trunc_nxv1f16(<vscale x 1 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
@@ -203,9 +203,9 @@ define <vscale x 2 x half> @trunc_nxv2f16(<vscale x 2 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v9, v0.t
@@ -238,9 +238,9 @@ define <vscale x 4 x half> @trunc_nxv4f16(<vscale x 4 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v10, v0.t
@@ -273,9 +273,9 @@ define <vscale x 8 x half> @trunc_nxv8f16(<vscale x 8 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v12, v0.t
@@ -308,9 +308,9 @@ define <vscale x 16 x half> @trunc_nxv16f16(<vscale x 16 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v8, v16, v0.t
@@ -343,9 +343,9 @@ define <vscale x 32 x half> @trunc_nxv32f16(<vscale x 32 x half> %x) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
 ; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
 ; ZVFHMIN-NEXT:    vfcvt.rtz.x.f.v v24, v16, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index e2298774a9b8d4..8925a9e0cee321 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -419,9 +419,9 @@ define <vscale x 4 x i1> @insert_nxv4i1_nxv1i1_0(<vscale x 4 x i1> %v, <vscale x
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
 ; CHECK-NEXT:    vmv.v.v v9, v8
@@ -437,13 +437,13 @@ define <vscale x 4 x i1> @insert_nxv4i1_nxv1i1_2(<vscale x 4 x i1> %v, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vsetvli a2, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll
index 911072d9571ff8..7f57f4fd22ff50 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insertelt-i1.ll
@@ -24,8 +24,8 @@ define <vscale x 1 x i1> @insertelt_idx_nxv1i1(<vscale x 1 x i1> %x, i1 %elt, i6
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf8, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
@@ -59,8 +59,8 @@ define <vscale x 2 x i1> @insertelt_idx_nxv2i1(<vscale x 2 x i1> %x, i1 %elt, i6
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf4, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
@@ -94,8 +94,8 @@ define <vscale x 4 x i1> @insertelt_idx_nxv4i1(<vscale x 4 x i1> %x, i1 %elt, i6
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
@@ -129,8 +129,8 @@ define <vscale x 8 x i1> @insertelt_idx_nxv8i1(<vscale x 8 x i1> %x, i1 %elt, i6
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, 1
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vmv.s.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll
index e9c78ed08f72af..7e2ec46339b332 100644
--- a/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/interleave-crash.ll
@@ -9,8 +9,8 @@ define void @interleave256(ptr %agg.result, ptr %0, ptr %1) {
 ; RV64-1024-NEXT:    vsetvli zero, a3, e16, m2, ta, ma
 ; RV64-1024-NEXT:    vle16.v v8, (a1)
 ; RV64-1024-NEXT:    vle16.v v10, (a2)
-; RV64-1024-NEXT:    vwaddu.vv v12, v8, v10
 ; RV64-1024-NEXT:    li a1, -1
+; RV64-1024-NEXT:    vwaddu.vv v12, v8, v10
 ; RV64-1024-NEXT:    vwmaccu.vx v12, a1, v10
 ; RV64-1024-NEXT:    li a1, 256
 ; RV64-1024-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
@@ -23,8 +23,8 @@ define void @interleave256(ptr %agg.result, ptr %0, ptr %1) {
 ; RV64-2048-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
 ; RV64-2048-NEXT:    vle16.v v8, (a1)
 ; RV64-2048-NEXT:    vle16.v v9, (a2)
-; RV64-2048-NEXT:    vwaddu.vv v10, v8, v9
 ; RV64-2048-NEXT:    li a1, -1
+; RV64-2048-NEXT:    vwaddu.vv v10, v8, v9
 ; RV64-2048-NEXT:    vwmaccu.vx v10, a1, v9
 ; RV64-2048-NEXT:    li a1, 256
 ; RV64-2048-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
@@ -47,8 +47,8 @@ define void @interleave512(ptr %agg.result, ptr %0, ptr %1) local_unnamed_addr {
 ; RV64-1024-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; RV64-1024-NEXT:    vle16.v v8, (a1)
 ; RV64-1024-NEXT:    vle16.v v12, (a2)
-; RV64-1024-NEXT:    vwaddu.vv v16, v8, v12
 ; RV64-1024-NEXT:    li a1, -1
+; RV64-1024-NEXT:    vwaddu.vv v16, v8, v12
 ; RV64-1024-NEXT:    vwmaccu.vx v16, a1, v12
 ; RV64-1024-NEXT:    li a1, 512
 ; RV64-1024-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
@@ -61,8 +61,8 @@ define void @interleave512(ptr %agg.result, ptr %0, ptr %1) local_unnamed_addr {
 ; RV64-2048-NEXT:    vsetvli zero, a3, e16, m2, ta, ma
 ; RV64-2048-NEXT:    vle16.v v8, (a1)
 ; RV64-2048-NEXT:    vle16.v v10, (a2)
-; RV64-2048-NEXT:    vwaddu.vv v12, v8, v10
 ; RV64-2048-NEXT:    li a1, -1
+; RV64-2048-NEXT:    vwaddu.vv v12, v8, v10
 ; RV64-2048-NEXT:    vwmaccu.vx v12, a1, v10
 ; RV64-2048-NEXT:    li a1, 512
 ; RV64-2048-NEXT:    vsetvli zero, a1, e16, m4, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
index e34b4a81b631bd..ffb9bf76fb4fab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/llrint-vp.ll
@@ -58,12 +58,12 @@ define <vscale x 16 x i64> @llrint_nxv16i64_nxv16f32(<vscale x 16 x float> %x, <
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-NEXT:    vfwcvt.x.f.v v16, v12, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB4_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
index c9f91bf9def2ce..9991bbc9725ba3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/lrint-vp.ll
@@ -120,12 +120,12 @@ define <vscale x 16 x iXLen> @lrint_nxv16f32(<vscale x 16 x float> %x, <vscale x
 ; RV64-i64-NEXT:    vmv1r.v v24, v0
 ; RV64-i64-NEXT:    csrr a1, vlenb
 ; RV64-i64-NEXT:    srli a2, a1, 3
-; RV64-i64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; RV64-i64-NEXT:    sub a3, a0, a1
+; RV64-i64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-i64-NEXT:    vslidedown.vx v0, v0, a2
-; RV64-i64-NEXT:    sub a2, a0, a1
-; RV64-i64-NEXT:    sltu a3, a0, a2
-; RV64-i64-NEXT:    addi a3, a3, -1
-; RV64-i64-NEXT:    and a2, a3, a2
+; RV64-i64-NEXT:    sltu a2, a0, a3
+; RV64-i64-NEXT:    addi a2, a2, -1
+; RV64-i64-NEXT:    and a2, a2, a3
 ; RV64-i64-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; RV64-i64-NEXT:    vfwcvt.x.f.v v16, v12, v0.t
 ; RV64-i64-NEXT:    bltu a0, a1, .LBB4_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
index 53598c609107b0..2553f563b7d0fd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
@@ -218,8 +218,8 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    sb a2, 14(a0)
 ; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a1)
-; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    addi a2, a1, 12
+; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a2)
 ; RV32-NEXT:    addi a2, a0, 12
@@ -237,8 +237,8 @@ define void @unaligned_memcpy15(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    sb a2, 14(a0)
 ; RV64-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a1)
-; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    addi a2, a1, 12
+; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a2)
 ; RV64-NEXT:    addi a2, a0, 12
@@ -431,8 +431,8 @@ define void @unaligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    li a2, 64
 ; RV32-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a1)
-; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a1)
@@ -445,8 +445,8 @@ define void @unaligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    li a2, 64
 ; RV64-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a1)
-; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    li a2, 32
 ; RV64-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a1)
@@ -523,8 +523,8 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-NEXT:    li a2, 128
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a1)
-; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    addi a2, a1, 128
+; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    li a3, 64
 ; RV32-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
 ; RV32-NEXT:    vle8.v v8, (a2)
@@ -545,8 +545,8 @@ define void @unaligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind {
 ; RV64-NEXT:    li a2, 128
 ; RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a1)
-; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    addi a2, a1, 128
+; RV64-NEXT:    vse8.v v8, (a0)
 ; RV64-NEXT:    li a3, 64
 ; RV64-NEXT:    vsetvli zero, a3, e8, m4, ta, ma
 ; RV64-NEXT:    vle8.v v8, (a2)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
index 5349878b5d1117..6c68051ff38c3f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
@@ -194,11 +194,11 @@ define void @memset_64(ptr %a, i8 %value) nounwind {
 ; RV32-BOTH-NEXT:    addi a2, a0, 48
 ; RV32-BOTH-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-BOTH-NEXT:    vmv.v.x v8, a1
-; RV32-BOTH-NEXT:    vse8.v v8, (a2)
 ; RV32-BOTH-NEXT:    addi a1, a0, 32
+; RV32-BOTH-NEXT:    vse8.v v8, (a2)
+; RV32-BOTH-NEXT:    addi a2, a0, 16
 ; RV32-BOTH-NEXT:    vse8.v v8, (a1)
-; RV32-BOTH-NEXT:    addi a1, a0, 16
-; RV32-BOTH-NEXT:    vse8.v v8, (a1)
+; RV32-BOTH-NEXT:    vse8.v v8, (a2)
 ; RV32-BOTH-NEXT:    vse8.v v8, (a0)
 ; RV32-BOTH-NEXT:    ret
 ;
@@ -207,11 +207,11 @@ define void @memset_64(ptr %a, i8 %value) nounwind {
 ; RV64-BOTH-NEXT:    addi a2, a0, 48
 ; RV64-BOTH-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64-BOTH-NEXT:    vmv.v.x v8, a1
-; RV64-BOTH-NEXT:    vse8.v v8, (a2)
 ; RV64-BOTH-NEXT:    addi a1, a0, 32
+; RV64-BOTH-NEXT:    vse8.v v8, (a2)
+; RV64-BOTH-NEXT:    addi a2, a0, 16
 ; RV64-BOTH-NEXT:    vse8.v v8, (a1)
-; RV64-BOTH-NEXT:    addi a1, a0, 16
-; RV64-BOTH-NEXT:    vse8.v v8, (a1)
+; RV64-BOTH-NEXT:    vse8.v v8, (a2)
 ; RV64-BOTH-NEXT:    vse8.v v8, (a0)
 ; RV64-BOTH-NEXT:    ret
   tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 64, i1 0)
@@ -334,11 +334,11 @@ define void @aligned_memset_64(ptr align 64 %a, i8 %value) nounwind {
 ; RV32-BOTH-NEXT:    addi a2, a0, 48
 ; RV32-BOTH-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV32-BOTH-NEXT:    vmv.v.x v8, a1
-; RV32-BOTH-NEXT:    vse8.v v8, (a2)
 ; RV32-BOTH-NEXT:    addi a1, a0, 32
+; RV32-BOTH-NEXT:    vse8.v v8, (a2)
+; RV32-BOTH-NEXT:    addi a2, a0, 16
 ; RV32-BOTH-NEXT:    vse8.v v8, (a1)
-; RV32-BOTH-NEXT:    addi a1, a0, 16
-; RV32-BOTH-NEXT:    vse8.v v8, (a1)
+; RV32-BOTH-NEXT:    vse8.v v8, (a2)
 ; RV32-BOTH-NEXT:    vse8.v v8, (a0)
 ; RV32-BOTH-NEXT:    ret
 ;
@@ -347,11 +347,11 @@ define void @aligned_memset_64(ptr align 64 %a, i8 %value) nounwind {
 ; RV64-BOTH-NEXT:    addi a2, a0, 48
 ; RV64-BOTH-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; RV64-BOTH-NEXT:    vmv.v.x v8, a1
-; RV64-BOTH-NEXT:    vse8.v v8, (a2)
 ; RV64-BOTH-NEXT:    addi a1, a0, 32
+; RV64-BOTH-NEXT:    vse8.v v8, (a2)
+; RV64-BOTH-NEXT:    addi a2, a0, 16
 ; RV64-BOTH-NEXT:    vse8.v v8, (a1)
-; RV64-BOTH-NEXT:    addi a1, a0, 16
-; RV64-BOTH-NEXT:    vse8.v v8, (a1)
+; RV64-BOTH-NEXT:    vse8.v v8, (a2)
 ; RV64-BOTH-NEXT:    vse8.v v8, (a0)
 ; RV64-BOTH-NEXT:    ret
   tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 %value, i64 64, i1 0)
@@ -704,12 +704,12 @@ define void @aligned_bzero_96(ptr %a) nounwind {
 ; RV32-BOTH:       # %bb.0:
 ; RV32-BOTH-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV32-BOTH-NEXT:    vmv.v.i v8, 0
-; RV32-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV32-BOTH-NEXT:    addi a1, a0, 80
+; RV32-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV32-BOTH-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV32-BOTH-NEXT:    vmv.v.i v8, 0
-; RV32-BOTH-NEXT:    vse64.v v8, (a1)
 ; RV32-BOTH-NEXT:    addi a0, a0, 64
+; RV32-BOTH-NEXT:    vse64.v v8, (a1)
 ; RV32-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV32-BOTH-NEXT:    ret
 ;
@@ -717,12 +717,12 @@ define void @aligned_bzero_96(ptr %a) nounwind {
 ; RV64-BOTH:       # %bb.0:
 ; RV64-BOTH-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
 ; RV64-BOTH-NEXT:    vmv.v.i v8, 0
-; RV64-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV64-BOTH-NEXT:    addi a1, a0, 80
+; RV64-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV64-BOTH-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; RV64-BOTH-NEXT:    vmv.v.i v8, 0
-; RV64-BOTH-NEXT:    vse64.v v8, (a1)
 ; RV64-BOTH-NEXT:    addi a0, a0, 64
+; RV64-BOTH-NEXT:    vse64.v v8, (a1)
 ; RV64-BOTH-NEXT:    vse64.v v8, (a0)
 ; RV64-BOTH-NEXT:    ret
   tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 96, i1 0)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
index 23353a2de386ae..a3eddbcc2baed4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll
@@ -2320,9 +2320,9 @@ define <vscale x 32 x i8> @mgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8>
 ; RV64-NEXT:    vmv1r.v v16, v0
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v24, v8
+; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
 ; RV64-NEXT:    vluxei64.v v12, (a0), v24, v0.t
-; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    srli a2, a1, 3
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
index 9ae470c7898963..ac26a014aaa643 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-combine.ll
@@ -12,8 +12,8 @@ define void @complex_gep(ptr %p, <vscale x 2 x i64> %vec.ind, <vscale x 2 x i1>
 ; RV32-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vnsrl.wi v10, v8, 0
 ; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    vmul.vx v8, v10, a1
 ; RV32-NEXT:    addi a0, a0, 28
+; RV32-NEXT:    vmul.vx v8, v10, a1
 ; RV32-NEXT:    vmv.v.i v9, 0
 ; RV32-NEXT:    vsoxei32.v v9, (a0), v8, v0.t
 ; RV32-NEXT:    ret
@@ -21,9 +21,9 @@ define void @complex_gep(ptr %p, <vscale x 2 x i64> %vec.ind, <vscale x 2 x i1>
 ; RV64-LABEL: complex_gep:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    addi a0, a0, 32
 ; RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a1
-; RV64-NEXT:    addi a0, a0, 32
 ; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.i v10, 0
 ; RV64-NEXT:    vsoxei64.v v10, (a0), v8, v0.t
@@ -38,14 +38,14 @@ define void @strided_store_zero_start(i64 %n, ptr %p) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vid.v v8
+; RV32-NEXT:    li a0, 48
+; RV32-NEXT:    addi a1, a2, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vnsrl.wi v8, v8, 0
-; RV32-NEXT:    li a0, 48
 ; RV32-NEXT:    vmul.vx v8, v8, a0
-; RV32-NEXT:    addi a0, a2, 32
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    vsoxei32.v v9, (a0), v8
+; RV32-NEXT:    vsoxei32.v v9, (a1), v8
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: strided_store_zero_start:
@@ -68,25 +68,25 @@ define void @strided_store_offset_start(i64 %n, ptr %p) {
 ; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vid.v v8
 ; RV32-NEXT:    vadd.vx v8, v8, a0
+; RV32-NEXT:    li a0, 48
+; RV32-NEXT:    addi a1, a2, 32
 ; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vnsrl.wi v8, v8, 0
-; RV32-NEXT:    li a0, 48
 ; RV32-NEXT:    vmul.vx v8, v8, a0
-; RV32-NEXT:    addi a0, a2, 32
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    vsoxei32.v v9, (a0), v8
+; RV32-NEXT:    vsoxei32.v v9, (a1), v8
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: strided_store_offset_start:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a2, a0, 3
 ; RV64-NEXT:    slli a0, a0, 6
+; RV64-NEXT:    vsetvli a3, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    sub a0, a0, a2
 ; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    addi a0, a0, 36
-; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    li a1, 56
 ; RV64-NEXT:    vsse64.v v8, (a0), a1
 ; RV64-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
index c7c7dbafd630a1..72c251ce985cbf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
@@ -1894,36 +1894,57 @@ define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    slli a2, a2, 5
 ; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    li a3, 24
+; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv8r.v v16, v8
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    vl8re64.v v8, (a1)
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vl8re64.v v8, (a0)
-; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    srli a0, a0, 3
 ; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v24, v0, a0
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v16, (zero), v8, v0.t
 ; RV64-NEXT:    vmv1r.v v0, v24
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    vsoxei64.v v8, (zero), v16, v0.t
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsoxei64.v v16, (zero), v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    slli a0, a0, 5
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -1940,14 +1961,14 @@ define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double
 define void @mscatter_baseidx_nxv16i8_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double> %val1, ptr %base, <vscale x 16 x i8> %idxs, <vscale x 16 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_nxv16i8_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vl2r.v v6, (a1)
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsext.vf4 v24, v6
-; RV32-NEXT:    vsll.vi v24, v24, 3
+; RV32-NEXT:    vl2r.v v4, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v7, v0, a1
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vsext.vf4 v24, v4
+; RV32-NEXT:    vsll.vi v24, v24, 3
 ; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v7
@@ -1961,12 +1982,12 @@ define void @mscatter_baseidx_nxv16i8_nxv16f64(<vscale x 8 x double> %val0, <vsc
 ; RV64-NEXT:    vsext.vf8 v24, v6
 ; RV64-NEXT:    vsll.vi v24, v24, 3
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT:    vsext.vf8 v8, v7
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsext.vf8 v8, v7
 ; RV64-NEXT:    vsll.vi v8, v8, 3
 ; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
 ; RV64-NEXT:    ret
@@ -1980,18 +2001,35 @@ define void @mscatter_baseidx_nxv16i8_nxv16f64(<vscale x 8 x double> %val0, <vsc
 define void @mscatter_baseidx_nxv16i16_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double> %val1, ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vl4re16.v v4, (a1)
-; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsext.vf2 v24, v4
-; RV32-NEXT:    vsll.vi v24, v24, 3
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    sub sp, sp, a2
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv8r.v v16, v8
+; RV32-NEXT:    vl4re16.v v8, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v7, v0, a1
+; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; RV32-NEXT:    vsext.vf2 v24, v8
+; RV32-NEXT:    vsll.vi v8, v24, 3
 ; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT:    vsoxei32.v v16, (a0), v8, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT:    addi a1, sp, 16
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsoxei32.v v16, (a0), v12, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mscatter_baseidx_nxv16i16_nxv16f64:
@@ -2001,12 +2039,12 @@ define void @mscatter_baseidx_nxv16i16_nxv16f64(<vscale x 8 x double> %val0, <vs
 ; RV64-NEXT:    vsext.vf4 v24, v4
 ; RV64-NEXT:    vsll.vi v24, v24, 3
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
-; RV64-NEXT:    vsext.vf4 v8, v6
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsext.vf4 v8, v6
 ; RV64-NEXT:    vsll.vi v8, v8, 3
 ; RV64-NEXT:    vsoxei64.v v16, (a0), v8, v0.t
 ; RV64-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll b/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
index da56e235ecd8ed..dc34a49e56aa57 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mutate-prior-vsetvli-avl.ll
@@ -11,15 +11,15 @@ define dso_local void @test(ptr nocapture noundef %var_99) {
 ; CHECK-NEXT:    addi a1, a1, %lo(.L__const.test.var_45)
 ; CHECK-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a1)
-; CHECK-NEXT:    li a1, 1
-; CHECK-NEXT:    vmul.vx v12, v8, a1
 ; CHECK-NEXT:    lui a1, %hi(.L__const.test.var_101)
 ; CHECK-NEXT:    addi a1, a1, %lo(.L__const.test.var_101)
-; CHECK-NEXT:    vle8.v v16, (a1)
-; CHECK-NEXT:    vmv.x.s a1, v12
+; CHECK-NEXT:    vle8.v v12, (a1)
+; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vmul.vx v16, v8, a1
+; CHECK-NEXT:    vmv.x.s a1, v16
 ; CHECK-NEXT:    vmsleu.vx v0, v8, a1
-; CHECK-NEXT:    vssra.vv v8, v16, v8
+; CHECK-NEXT:    vssra.vv v8, v12, v8
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v8, v0
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
index 8da064529f1b3d..c49a55319a3c41 100644
--- a/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/named-vector-shuffle-reverse.ll
@@ -21,8 +21,8 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) {
 ; RV32-BITS-UNKNOWN:       # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
+; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    srli a0, a0, 2
 ; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
@@ -37,8 +37,8 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) {
 ; RV32-BITS-256:       # %bb.0:
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; RV32-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-256-NEXT:    csrr a0, vlenb
+; RV32-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-256-NEXT:    srli a0, a0, 2
 ; RV32-BITS-256-NEXT:    addi a0, a0, -1
 ; RV32-BITS-256-NEXT:    vid.v v9
@@ -51,8 +51,8 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) {
 ; RV32-BITS-512:       # %bb.0:
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; RV32-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-512-NEXT:    csrr a0, vlenb
+; RV32-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-512-NEXT:    srli a0, a0, 2
 ; RV32-BITS-512-NEXT:    addi a0, a0, -1
 ; RV32-BITS-512-NEXT:    vid.v v9
@@ -65,8 +65,8 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) {
 ; RV64-BITS-UNKNOWN:       # %bb.0:
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
+; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    srli a0, a0, 2
 ; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
@@ -81,8 +81,8 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) {
 ; RV64-BITS-256:       # %bb.0:
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; RV64-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-256-NEXT:    csrr a0, vlenb
+; RV64-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-256-NEXT:    srli a0, a0, 2
 ; RV64-BITS-256-NEXT:    addi a0, a0, -1
 ; RV64-BITS-256-NEXT:    vid.v v9
@@ -95,8 +95,8 @@ define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) {
 ; RV64-BITS-512:       # %bb.0:
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; RV64-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-512-NEXT:    csrr a0, vlenb
+; RV64-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-512-NEXT:    srli a0, a0, 2
 ; RV64-BITS-512-NEXT:    addi a0, a0, -1
 ; RV64-BITS-512-NEXT:    vid.v v9
@@ -113,8 +113,8 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) {
 ; RV32-BITS-UNKNOWN:       # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
+; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    srli a0, a0, 1
 ; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -129,8 +129,8 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) {
 ; RV32-BITS-256:       # %bb.0:
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; RV32-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-256-NEXT:    csrr a0, vlenb
+; RV32-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-256-NEXT:    srli a0, a0, 1
 ; RV32-BITS-256-NEXT:    addi a0, a0, -1
 ; RV32-BITS-256-NEXT:    vid.v v9
@@ -143,8 +143,8 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) {
 ; RV32-BITS-512:       # %bb.0:
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; RV32-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-512-NEXT:    csrr a0, vlenb
+; RV32-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-512-NEXT:    srli a0, a0, 1
 ; RV32-BITS-512-NEXT:    addi a0, a0, -1
 ; RV32-BITS-512-NEXT:    vid.v v9
@@ -157,8 +157,8 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) {
 ; RV64-BITS-UNKNOWN:       # %bb.0:
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
+; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    srli a0, a0, 1
 ; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
@@ -173,8 +173,8 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) {
 ; RV64-BITS-256:       # %bb.0:
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; RV64-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-256-NEXT:    csrr a0, vlenb
+; RV64-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-256-NEXT:    srli a0, a0, 1
 ; RV64-BITS-256-NEXT:    addi a0, a0, -1
 ; RV64-BITS-256-NEXT:    vid.v v9
@@ -187,8 +187,8 @@ define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) {
 ; RV64-BITS-512:       # %bb.0:
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; RV64-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-512-NEXT:    csrr a0, vlenb
+; RV64-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-512-NEXT:    srli a0, a0, 1
 ; RV64-BITS-512-NEXT:    addi a0, a0, -1
 ; RV64-BITS-512-NEXT:    vid.v v9
@@ -205,8 +205,8 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
 ; RV32-BITS-UNKNOWN:       # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
+; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vid.v v10
@@ -220,8 +220,8 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
 ; RV32-BITS-256:       # %bb.0:
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-256-NEXT:    csrr a0, vlenb
+; RV32-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-256-NEXT:    addi a0, a0, -1
 ; RV32-BITS-256-NEXT:    vid.v v9
 ; RV32-BITS-256-NEXT:    vrsub.vx v9, v9, a0
@@ -233,8 +233,8 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
 ; RV32-BITS-512:       # %bb.0:
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-512-NEXT:    csrr a0, vlenb
+; RV32-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV32-BITS-512-NEXT:    addi a0, a0, -1
 ; RV32-BITS-512-NEXT:    vid.v v9
 ; RV32-BITS-512-NEXT:    vrsub.vx v9, v9, a0
@@ -246,8 +246,8 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
 ; RV64-BITS-UNKNOWN:       # %bb.0:
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
+; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vid.v v10
@@ -261,8 +261,8 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
 ; RV64-BITS-256:       # %bb.0:
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-256-NEXT:    csrr a0, vlenb
+; RV64-BITS-256-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-256-NEXT:    addi a0, a0, -1
 ; RV64-BITS-256-NEXT:    vid.v v9
 ; RV64-BITS-256-NEXT:    vrsub.vx v9, v9, a0
@@ -274,8 +274,8 @@ define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) {
 ; RV64-BITS-512:       # %bb.0:
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-512-NEXT:    csrr a0, vlenb
+; RV64-BITS-512-NEXT:    vmerge.vim v8, v8, 1, v0
 ; RV64-BITS-512-NEXT:    addi a0, a0, -1
 ; RV64-BITS-512-NEXT:    vid.v v9
 ; RV64-BITS-512-NEXT:    vrsub.vx v9, v9, a0
@@ -290,12 +290,14 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
 ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv16i1:
 ; RV32-BITS-UNKNOWN:       # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
-; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v10, 0
+; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
+; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v8, v8, a0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v10, 0
 ; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v10, v10, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v13, v10, v8
@@ -307,12 +309,14 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
 ; RV32-BITS-256-LABEL: reverse_nxv16i1:
 ; RV32-BITS-256:       # %bb.0:
 ; RV32-BITS-256-NEXT:    csrr a0, vlenb
-; RV32-BITS-256-NEXT:    addi a0, a0, -1
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vid.v v8
+; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV32-BITS-256-NEXT:    vmv.v.i v10, 0
+; RV32-BITS-256-NEXT:    addi a0, a0, -1
+; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vrsub.vx v8, v8, a0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV32-BITS-256-NEXT:    vmv.v.i v10, 0
 ; RV32-BITS-256-NEXT:    vmerge.vim v10, v10, 1, v0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vrgather.vv v13, v10, v8
@@ -324,12 +328,14 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
 ; RV32-BITS-512-LABEL: reverse_nxv16i1:
 ; RV32-BITS-512:       # %bb.0:
 ; RV32-BITS-512-NEXT:    csrr a0, vlenb
-; RV32-BITS-512-NEXT:    addi a0, a0, -1
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vid.v v8
+; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV32-BITS-512-NEXT:    vmv.v.i v10, 0
+; RV32-BITS-512-NEXT:    addi a0, a0, -1
+; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vrsub.vx v8, v8, a0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV32-BITS-512-NEXT:    vmv.v.i v10, 0
 ; RV32-BITS-512-NEXT:    vmerge.vim v10, v10, 1, v0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vrgather.vv v13, v10, v8
@@ -341,12 +347,14 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
 ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv16i1:
 ; RV64-BITS-UNKNOWN:       # %bb.0:
 ; RV64-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
-; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v10, 0
+; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
+; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v8, v8, a0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v10, 0
 ; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v10, v10, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v13, v10, v8
@@ -358,12 +366,14 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
 ; RV64-BITS-256-LABEL: reverse_nxv16i1:
 ; RV64-BITS-256:       # %bb.0:
 ; RV64-BITS-256-NEXT:    csrr a0, vlenb
-; RV64-BITS-256-NEXT:    addi a0, a0, -1
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vid.v v8
+; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV64-BITS-256-NEXT:    vmv.v.i v10, 0
+; RV64-BITS-256-NEXT:    addi a0, a0, -1
+; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vrsub.vx v8, v8, a0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV64-BITS-256-NEXT:    vmv.v.i v10, 0
 ; RV64-BITS-256-NEXT:    vmerge.vim v10, v10, 1, v0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vrgather.vv v13, v10, v8
@@ -375,12 +385,14 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) {
 ; RV64-BITS-512-LABEL: reverse_nxv16i1:
 ; RV64-BITS-512:       # %bb.0:
 ; RV64-BITS-512-NEXT:    csrr a0, vlenb
-; RV64-BITS-512-NEXT:    addi a0, a0, -1
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vid.v v8
+; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; RV64-BITS-512-NEXT:    vmv.v.i v10, 0
+; RV64-BITS-512-NEXT:    addi a0, a0, -1
+; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vrsub.vx v8, v8, a0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; RV64-BITS-512-NEXT:    vmv.v.i v10, 0
 ; RV64-BITS-512-NEXT:    vmerge.vim v10, v10, 1, v0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vrgather.vv v13, v10, v8
@@ -396,18 +408,20 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv32i1:
 ; RV32-BITS-UNKNOWN:       # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
-; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vid.v v8
-; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v12, v8, a0
+; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v12, 0
+; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
+; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v8, a0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v12, v12, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v16, v12
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v17, v12
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v18, v12
-; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v8, v19, v12
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v12, v16
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v13, v16
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v14, v16
+; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v8, v15, v16
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vmsne.vi v0, v8, 0
 ; RV32-BITS-UNKNOWN-NEXT:    ret
@@ -415,18 +429,20 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV32-BITS-256-LABEL: reverse_nxv32i1:
 ; RV32-BITS-256:       # %bb.0:
 ; RV32-BITS-256-NEXT:    csrr a0, vlenb
-; RV32-BITS-256-NEXT:    addi a0, a0, -1
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vid.v v8
-; RV32-BITS-256-NEXT:    vrsub.vx v12, v8, a0
+; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV32-BITS-256-NEXT:    vmv.v.i v12, 0
+; RV32-BITS-256-NEXT:    addi a0, a0, -1
+; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32-BITS-256-NEXT:    vrsub.vx v16, v8, a0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV32-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-256-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV32-BITS-256-NEXT:    vmerge.vim v12, v12, 1, v0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV32-BITS-256-NEXT:    vrgather.vv v11, v16, v12
-; RV32-BITS-256-NEXT:    vrgather.vv v10, v17, v12
-; RV32-BITS-256-NEXT:    vrgather.vv v9, v18, v12
-; RV32-BITS-256-NEXT:    vrgather.vv v8, v19, v12
+; RV32-BITS-256-NEXT:    vrgather.vv v11, v12, v16
+; RV32-BITS-256-NEXT:    vrgather.vv v10, v13, v16
+; RV32-BITS-256-NEXT:    vrgather.vv v9, v14, v16
+; RV32-BITS-256-NEXT:    vrgather.vv v8, v15, v16
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; RV32-BITS-256-NEXT:    vmsne.vi v0, v8, 0
 ; RV32-BITS-256-NEXT:    ret
@@ -434,18 +450,20 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV32-BITS-512-LABEL: reverse_nxv32i1:
 ; RV32-BITS-512:       # %bb.0:
 ; RV32-BITS-512-NEXT:    csrr a0, vlenb
-; RV32-BITS-512-NEXT:    addi a0, a0, -1
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vid.v v8
-; RV32-BITS-512-NEXT:    vrsub.vx v12, v8, a0
+; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV32-BITS-512-NEXT:    vmv.v.i v12, 0
+; RV32-BITS-512-NEXT:    addi a0, a0, -1
+; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV32-BITS-512-NEXT:    vrsub.vx v16, v8, a0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV32-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-512-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV32-BITS-512-NEXT:    vmerge.vim v12, v12, 1, v0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV32-BITS-512-NEXT:    vrgather.vv v11, v16, v12
-; RV32-BITS-512-NEXT:    vrgather.vv v10, v17, v12
-; RV32-BITS-512-NEXT:    vrgather.vv v9, v18, v12
-; RV32-BITS-512-NEXT:    vrgather.vv v8, v19, v12
+; RV32-BITS-512-NEXT:    vrgather.vv v11, v12, v16
+; RV32-BITS-512-NEXT:    vrgather.vv v10, v13, v16
+; RV32-BITS-512-NEXT:    vrgather.vv v9, v14, v16
+; RV32-BITS-512-NEXT:    vrgather.vv v8, v15, v16
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; RV32-BITS-512-NEXT:    vmsne.vi v0, v8, 0
 ; RV32-BITS-512-NEXT:    ret
@@ -453,18 +471,20 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv32i1:
 ; RV64-BITS-UNKNOWN:       # %bb.0:
 ; RV64-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
-; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vid.v v8
-; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v12, v8, a0
+; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v12, 0
+; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
+; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v8, a0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v12, v12, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v16, v12
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v17, v12
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v18, v12
-; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v8, v19, v12
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v11, v12, v16
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v10, v13, v16
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v9, v14, v16
+; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v8, v15, v16
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-UNKNOWN-NEXT:    ret
@@ -472,18 +492,20 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV64-BITS-256-LABEL: reverse_nxv32i1:
 ; RV64-BITS-256:       # %bb.0:
 ; RV64-BITS-256-NEXT:    csrr a0, vlenb
-; RV64-BITS-256-NEXT:    addi a0, a0, -1
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vid.v v8
-; RV64-BITS-256-NEXT:    vrsub.vx v12, v8, a0
+; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV64-BITS-256-NEXT:    vmv.v.i v12, 0
+; RV64-BITS-256-NEXT:    addi a0, a0, -1
+; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64-BITS-256-NEXT:    vrsub.vx v16, v8, a0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV64-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-256-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV64-BITS-256-NEXT:    vmerge.vim v12, v12, 1, v0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV64-BITS-256-NEXT:    vrgather.vv v11, v16, v12
-; RV64-BITS-256-NEXT:    vrgather.vv v10, v17, v12
-; RV64-BITS-256-NEXT:    vrgather.vv v9, v18, v12
-; RV64-BITS-256-NEXT:    vrgather.vv v8, v19, v12
+; RV64-BITS-256-NEXT:    vrgather.vv v11, v12, v16
+; RV64-BITS-256-NEXT:    vrgather.vv v10, v13, v16
+; RV64-BITS-256-NEXT:    vrgather.vv v9, v14, v16
+; RV64-BITS-256-NEXT:    vrgather.vv v8, v15, v16
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; RV64-BITS-256-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-256-NEXT:    ret
@@ -491,18 +513,20 @@ define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) {
 ; RV64-BITS-512-LABEL: reverse_nxv32i1:
 ; RV64-BITS-512:       # %bb.0:
 ; RV64-BITS-512-NEXT:    csrr a0, vlenb
-; RV64-BITS-512-NEXT:    addi a0, a0, -1
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vid.v v8
-; RV64-BITS-512-NEXT:    vrsub.vx v12, v8, a0
+; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; RV64-BITS-512-NEXT:    vmv.v.i v12, 0
+; RV64-BITS-512-NEXT:    addi a0, a0, -1
+; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; RV64-BITS-512-NEXT:    vrsub.vx v16, v8, a0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; RV64-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-512-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV64-BITS-512-NEXT:    vmerge.vim v12, v12, 1, v0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV64-BITS-512-NEXT:    vrgather.vv v11, v16, v12
-; RV64-BITS-512-NEXT:    vrgather.vv v10, v17, v12
-; RV64-BITS-512-NEXT:    vrgather.vv v9, v18, v12
-; RV64-BITS-512-NEXT:    vrgather.vv v8, v19, v12
+; RV64-BITS-512-NEXT:    vrgather.vv v11, v12, v16
+; RV64-BITS-512-NEXT:    vrgather.vv v10, v13, v16
+; RV64-BITS-512-NEXT:    vrgather.vv v9, v14, v16
+; RV64-BITS-512-NEXT:    vrgather.vv v8, v15, v16
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; RV64-BITS-512-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-BITS-512-NEXT:    ret
@@ -514,13 +538,15 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV32-BITS-UNKNOWN-LABEL: reverse_nxv64i1:
 ; RV32-BITS-UNKNOWN:       # %bb.0:
 ; RV32-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
-; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v24, 0
+; RV32-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
+; RV32-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v8, a0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v24, v8, 1, v0
+; RV32-BITS-UNKNOWN-NEXT:    vmerge.vim v24, v24, 1, v0
 ; RV32-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v15, v24, v16
 ; RV32-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v14, v25, v16
@@ -537,13 +563,15 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV32-BITS-256-LABEL: reverse_nxv64i1:
 ; RV32-BITS-256:       # %bb.0:
 ; RV32-BITS-256-NEXT:    csrr a0, vlenb
-; RV32-BITS-256-NEXT:    addi a0, a0, -1
 ; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vid.v v8
+; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV32-BITS-256-NEXT:    vmv.v.i v16, 0
+; RV32-BITS-256-NEXT:    addi a0, a0, -1
+; RV32-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vrsub.vx v24, v8, a0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-256-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV32-BITS-256-NEXT:    vmerge.vim v16, v16, 1, v0
 ; RV32-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-256-NEXT:    vrgather.vv v15, v16, v24
 ; RV32-BITS-256-NEXT:    vrgather.vv v14, v17, v24
@@ -560,13 +588,15 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV32-BITS-512-LABEL: reverse_nxv64i1:
 ; RV32-BITS-512:       # %bb.0:
 ; RV32-BITS-512-NEXT:    csrr a0, vlenb
-; RV32-BITS-512-NEXT:    addi a0, a0, -1
 ; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vid.v v8
+; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV32-BITS-512-NEXT:    vmv.v.i v16, 0
+; RV32-BITS-512-NEXT:    addi a0, a0, -1
+; RV32-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vrsub.vx v24, v8, a0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV32-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV32-BITS-512-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV32-BITS-512-NEXT:    vmerge.vim v16, v16, 1, v0
 ; RV32-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-BITS-512-NEXT:    vrgather.vv v15, v16, v24
 ; RV32-BITS-512-NEXT:    vrgather.vv v14, v17, v24
@@ -583,13 +613,15 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV64-BITS-UNKNOWN-LABEL: reverse_nxv64i1:
 ; RV64-BITS-UNKNOWN:       # %bb.0:
 ; RV64-BITS-UNKNOWN-NEXT:    csrr a0, vlenb
-; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vid.v v8
+; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v24, 0
+; RV64-BITS-UNKNOWN-NEXT:    addi a0, a0, -1
+; RV64-BITS-UNKNOWN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vrsub.vx v16, v8, a0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-UNKNOWN-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v24, v8, 1, v0
+; RV64-BITS-UNKNOWN-NEXT:    vmerge.vim v24, v24, 1, v0
 ; RV64-BITS-UNKNOWN-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v15, v24, v16
 ; RV64-BITS-UNKNOWN-NEXT:    vrgatherei16.vv v14, v25, v16
@@ -606,13 +638,15 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV64-BITS-256-LABEL: reverse_nxv64i1:
 ; RV64-BITS-256:       # %bb.0:
 ; RV64-BITS-256-NEXT:    csrr a0, vlenb
-; RV64-BITS-256-NEXT:    addi a0, a0, -1
 ; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vid.v v8
+; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV64-BITS-256-NEXT:    vmv.v.i v16, 0
+; RV64-BITS-256-NEXT:    addi a0, a0, -1
+; RV64-BITS-256-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vrsub.vx v24, v8, a0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-256-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-256-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV64-BITS-256-NEXT:    vmerge.vim v16, v16, 1, v0
 ; RV64-BITS-256-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-256-NEXT:    vrgather.vv v15, v16, v24
 ; RV64-BITS-256-NEXT:    vrgather.vv v14, v17, v24
@@ -629,13 +663,15 @@ define <vscale x 64 x i1> @reverse_nxv64i1(<vscale x 64 x i1> %a) {
 ; RV64-BITS-512-LABEL: reverse_nxv64i1:
 ; RV64-BITS-512:       # %bb.0:
 ; RV64-BITS-512-NEXT:    csrr a0, vlenb
-; RV64-BITS-512-NEXT:    addi a0, a0, -1
 ; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vid.v v8
+; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; RV64-BITS-512-NEXT:    vmv.v.i v16, 0
+; RV64-BITS-512-NEXT:    addi a0, a0, -1
+; RV64-BITS-512-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vrsub.vx v24, v8, a0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; RV64-BITS-512-NEXT:    vmv.v.i v8, 0
-; RV64-BITS-512-NEXT:    vmerge.vim v16, v8, 1, v0
+; RV64-BITS-512-NEXT:    vmerge.vim v16, v16, 1, v0
 ; RV64-BITS-512-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-BITS-512-NEXT:    vrgather.vv v15, v16, v24
 ; RV64-BITS-512-NEXT:    vrgather.vv v14, v17, v24
@@ -1929,10 +1965,10 @@ define <vscale x 6 x i64> @reverse_nxv6i64(<vscale x 6 x i64> %a) {
 ; CHECK-LABEL: reverse_nxv6i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v22, v16, a0
 ; CHECK-NEXT:    vrgather.vv v21, v10, v22
 ; CHECK-NEXT:    vrgather.vv v19, v12, v22
@@ -1967,27 +2003,27 @@ define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT:    vid.v v20
 ; RV32-NEXT:    srli a1, a0, 3
 ; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; RV32-NEXT:    vid.v v20
 ; RV32-NEXT:    vrsub.vx v20, v20, a1
 ; RV32-NEXT:    vrgather.vv v31, v12, v20
+; RV32-NEXT:    vrgather.vv v7, v8, v20
 ; RV32-NEXT:    vrgather.vv v30, v13, v20
+; RV32-NEXT:    vrgather.vv v6, v9, v20
 ; RV32-NEXT:    vrgather.vv v29, v14, v20
+; RV32-NEXT:    vrgather.vv v5, v10, v20
 ; RV32-NEXT:    vrgather.vv v28, v15, v20
+; RV32-NEXT:    vrgather.vv v4, v11, v20
+; RV32-NEXT:    addi a1, sp, 64
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    vrgather.vv v27, v16, v20
+; RV32-NEXT:    vs4r.v v4, (a0)
 ; RV32-NEXT:    vrgather.vv v26, v17, v20
 ; RV32-NEXT:    vrgather.vv v25, v18, v20
 ; RV32-NEXT:    vrgather.vv v24, v19, v20
-; RV32-NEXT:    vrgather.vv v15, v8, v20
-; RV32-NEXT:    vrgather.vv v14, v9, v20
-; RV32-NEXT:    vrgather.vv v13, v10, v20
-; RV32-NEXT:    vrgather.vv v12, v11, v20
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    addi a1, sp, 64
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    vs4r.v v12, (a0)
 ; RV32-NEXT:    vs8r.v v24, (a1)
 ; RV32-NEXT:    vl8re64.v v16, (a0)
 ; RV32-NEXT:    vl8re64.v v8, (a1)
@@ -2016,27 +2052,27 @@ define <vscale x 12 x i64> @reverse_nxv12i64(<vscale x 12 x i64> %a) {
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; RV64-NEXT:    vid.v v20
 ; RV64-NEXT:    srli a1, a0, 3
 ; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; RV64-NEXT:    vid.v v20
 ; RV64-NEXT:    vrsub.vx v20, v20, a1
 ; RV64-NEXT:    vrgather.vv v31, v12, v20
+; RV64-NEXT:    vrgather.vv v7, v8, v20
 ; RV64-NEXT:    vrgather.vv v30, v13, v20
+; RV64-NEXT:    vrgather.vv v6, v9, v20
 ; RV64-NEXT:    vrgather.vv v29, v14, v20
+; RV64-NEXT:    vrgather.vv v5, v10, v20
 ; RV64-NEXT:    vrgather.vv v28, v15, v20
+; RV64-NEXT:    vrgather.vv v4, v11, v20
+; RV64-NEXT:    addi a1, sp, 64
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    vrgather.vv v27, v16, v20
+; RV64-NEXT:    vs4r.v v4, (a0)
 ; RV64-NEXT:    vrgather.vv v26, v17, v20
 ; RV64-NEXT:    vrgather.vv v25, v18, v20
 ; RV64-NEXT:    vrgather.vv v24, v19, v20
-; RV64-NEXT:    vrgather.vv v15, v8, v20
-; RV64-NEXT:    vrgather.vv v14, v9, v20
-; RV64-NEXT:    vrgather.vv v13, v10, v20
-; RV64-NEXT:    vrgather.vv v12, v11, v20
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    addi a1, sp, 64
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    vs4r.v v12, (a0)
 ; RV64-NEXT:    vs8r.v v24, (a1)
 ; RV64-NEXT:    vl8re64.v v16, (a0)
 ; RV64-NEXT:    vl8re64.v v8, (a1)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
index b43655283b9757..94fce80ad3b8e4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x bfloat> @vp_nearbyint_nxv1bf16(<vscale x 1 x bfloat> %va, <
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
@@ -44,10 +44,10 @@ define <vscale x 1 x bfloat> @vp_nearbyint_nxv1bf16_unmasked(<vscale x 1 x bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -69,10 +69,10 @@ define <vscale x 2 x bfloat> @vp_nearbyint_nxv2bf16(<vscale x 2 x bfloat> %va, <
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
@@ -94,10 +94,10 @@ define <vscale x 2 x bfloat> @vp_nearbyint_nxv2bf16_unmasked(<vscale x 2 x bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -120,10 +120,10 @@ define <vscale x 4 x bfloat> @vp_nearbyint_nxv4bf16(<vscale x 4 x bfloat> %va, <
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
@@ -146,10 +146,10 @@ define <vscale x 4 x bfloat> @vp_nearbyint_nxv4bf16_unmasked(<vscale x 4 x bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -172,10 +172,10 @@ define <vscale x 8 x bfloat> @vp_nearbyint_nxv8bf16(<vscale x 8 x bfloat> %va, <
 ; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
@@ -198,10 +198,10 @@ define <vscale x 8 x bfloat> @vp_nearbyint_nxv8bf16_unmasked(<vscale x 8 x bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -224,10 +224,10 @@ define <vscale x 16 x bfloat> @vp_nearbyint_nxv16bf16(<vscale x 16 x bfloat> %va
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    frflags a0
@@ -250,10 +250,10 @@ define <vscale x 16 x bfloat> @vp_nearbyint_nxv16bf16_unmasked(<vscale x 16 x bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    frflags a0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -273,50 +273,40 @@ declare <vscale x 32 x bfloat> @llvm.vp.nearbyint.nxv32bf16(<vscale x 32 x bfloa
 define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv32bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v17, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; CHECK-NEXT:    frflags a2
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    fsflags a2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
@@ -332,12 +322,6 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    fsflags a0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.nearbyint.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -346,51 +330,41 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16(<vscale x 32 x bfloat> %va
 define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    frflags a2
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    fsflags a2
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
@@ -402,12 +376,6 @@ define <vscale x 32 x bfloat> @vp_nearbyint_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    fsflags a0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.nearbyint.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -436,10 +404,10 @@ define <vscale x 1 x half> @vp_nearbyint_nxv1f16(<vscale x 1 x half> %va, <vscal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
@@ -476,10 +444,10 @@ define <vscale x 1 x half> @vp_nearbyint_nxv1f16_unmasked(<vscale x 1 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -518,10 +486,10 @@ define <vscale x 2 x half> @vp_nearbyint_nxv2f16(<vscale x 2 x half> %va, <vscal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
@@ -558,10 +526,10 @@ define <vscale x 2 x half> @vp_nearbyint_nxv2f16_unmasked(<vscale x 2 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -601,10 +569,10 @@ define <vscale x 4 x half> @vp_nearbyint_nxv4f16(<vscale x 4 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
@@ -642,10 +610,10 @@ define <vscale x 4 x half> @vp_nearbyint_nxv4f16_unmasked(<vscale x 4 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -665,9 +633,9 @@ declare <vscale x 8 x half> @llvm.vp.nearbyint.nxv8f16(<vscale x 8 x half>, <vsc
 define <vscale x 8 x half> @vp_nearbyint_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv8f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -687,10 +655,10 @@ define <vscale x 8 x half> @vp_nearbyint_nxv8f16(<vscale x 8 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
@@ -728,10 +696,10 @@ define <vscale x 8 x half> @vp_nearbyint_nxv8f16_unmasked(<vscale x 8 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -751,9 +719,9 @@ declare <vscale x 16 x half> @llvm.vp.nearbyint.nxv16f16(<vscale x 16 x half>, <
 define <vscale x 16 x half> @vp_nearbyint_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -773,10 +741,10 @@ define <vscale x 16 x half> @vp_nearbyint_nxv16f16(<vscale x 16 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a0
@@ -814,10 +782,10 @@ define <vscale x 16 x half> @vp_nearbyint_nxv16f16_unmasked(<vscale x 16 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    frflags a0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -837,9 +805,9 @@ declare <vscale x 32 x half> @llvm.vp.nearbyint.nxv32f16(<vscale x 32 x half>, <
 define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_nearbyint_nxv32f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
@@ -856,50 +824,40 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    fsflags a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
@@ -915,12 +873,6 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    fsflags a0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.nearbyint.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -944,51 +896,41 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ;
 ; ZVFHMIN-LABEL: vp_nearbyint_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    frflags a2
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    fsflags a2
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -1000,12 +942,6 @@ define <vscale x 32 x half> @vp_nearbyint_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    fsflags a0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.nearbyint.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1274,9 +1210,9 @@ declare <vscale x 2 x double> @llvm.vp.nearbyint.nxv2f64(<vscale x 2 x double>,
 define <vscale x 2 x double> @vp_nearbyint_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv2f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -1318,9 +1254,9 @@ declare <vscale x 4 x double> @llvm.vp.nearbyint.nxv4f64(<vscale x 4 x double>,
 define <vscale x 4 x double> @vp_nearbyint_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -1362,9 +1298,9 @@ declare <vscale x 7 x double> @llvm.vp.nearbyint.nxv7f64(<vscale x 7 x double>,
 define <vscale x 7 x double> @vp_nearbyint_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv7f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1406,9 +1342,9 @@ declare <vscale x 8 x double> @llvm.vp.nearbyint.nxv8f64(<vscale x 8 x double>,
 define <vscale x 8 x double> @vp_nearbyint_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_nearbyint_nxv8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1453,12 +1389,12 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI44_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a3)
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
@@ -1501,12 +1437,12 @@ define <vscale x 16 x double> @vp_nearbyint_nxv16f64_unmasked(<vscale x 16 x dou
 ; CHECK-LABEL: vp_nearbyint_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI45_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a3)
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI45_0)
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/pr104480.ll b/llvm/test/CodeGen/RISCV/rvv/pr104480.ll
index afa12ea03c0bfa..21f1984001d3c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr104480.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr104480.ll
@@ -44,16 +44,16 @@ define <vscale x 4 x i64> @test_mulhs_expand(<vscale x 4 x i64> %broadcast.splat
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a1, a0, 1365
+; CHECK-NEXT:    addi a1, sp, 8
+; CHECK-NEXT:    addi a2, a0, 1365
 ; CHECK-NEXT:    addi a0, a0, 1366
 ; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
+; CHECK-NEXT:    sw a2, 12(sp)
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vlse64.v v12, (a1), zero
 ; CHECK-NEXT:    vrgather.vi v16, v8, 0
-; CHECK-NEXT:    vmulh.vv v8, v16, v12
 ; CHECK-NEXT:    li a0, 63
+; CHECK-NEXT:    vmulh.vv v8, v16, v12
 ; CHECK-NEXT:    vsrl.vx v12, v8, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v12
 ; CHECK-NEXT:    li a0, 3
@@ -73,13 +73,13 @@ define <vscale x 4 x i64> @test_mulhu_expand(<vscale x 4 x i64> %broadcast.splat
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    lui a0, 699051
-; CHECK-NEXT:    addi a1, a0, -1366
+; CHECK-NEXT:    addi a1, sp, 8
+; CHECK-NEXT:    addi a2, a0, -1366
 ; CHECK-NEXT:    addi a0, a0, -1365
 ; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    sw a1, 12(sp)
-; CHECK-NEXT:    addi a0, sp, 8
-; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vlse64.v v12, (a0), zero
+; CHECK-NEXT:    sw a2, 12(sp)
+; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vlse64.v v12, (a1), zero
 ; CHECK-NEXT:    vrgather.vi v16, v8, 0
 ; CHECK-NEXT:    vmulhu.vv v8, v16, v12
 ; CHECK-NEXT:    vsrl.vi v8, v8, 1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/pr52475.ll b/llvm/test/CodeGen/RISCV/rvv/pr52475.ll
index a885f23eef39e9..bca4d9c24d6006 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr52475.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr52475.ll
@@ -9,8 +9,8 @@ define <128 x i32> @ret_split_v128i32(ptr %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a1, 448
+; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    vle32.v v8, (a2)
 ; CHECK-NEXT:    addi a2, a0, 448
 ; CHECK-NEXT:    vse32.v v8, (a2)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll
index c5fd6943e51bef..6b08c4409fb635 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr61561.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr61561.ll
@@ -5,14 +5,15 @@ define <vscale x 4 x i8> @foo(ptr %p) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vl1re16.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vsll.vi v8, v8, 3
-; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vzext.vf2 v10, v8
-; CHECK-NEXT:    li a0, 248
-; CHECK-NEXT:    vand.vx v8, v10, a0
 ; CHECK-NEXT:    lui a0, 4
+; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
+; CHECK-NEXT:    li a0, 248
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsll.vi v8, v8, 3
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vand.vx v8, v12, a0
 ; CHECK-NEXT:    lui a0, 1
 ; CHECK-NEXT:    addi a0, a0, -361
 ; CHECK-NEXT:    vmacc.vx v10, a0, v8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/pr88576.ll b/llvm/test/CodeGen/RISCV/rvv/pr88576.ll
index e8a8d9e422ac18..37c67b9ff2f6af 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr88576.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr88576.ll
@@ -23,13 +23,13 @@ define i1 @foo(<vscale x 16 x i8> %x, i64 %y) {
 ; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    andi sp, sp, -64
+; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    addi a2, sp, 64
-; CHECK-NEXT:    add a0, a2, a0
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    vmerge.vim v24, v16, 1, v0
 ; CHECK-NEXT:    vs8r.v v24, (a1)
 ; CHECK-NEXT:    vmv1r.v v0, v8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
index fd3c4593462bf8..06a357eeaeb61e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr95865.ll
@@ -35,8 +35,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    .cfi_offset s9, -88
 ; CHECK-NEXT:    .cfi_offset s10, -96
 ; CHECK-NEXT:    .cfi_offset s11, -104
-; CHECK-NEXT:    li s2, 0
-; CHECK-NEXT:    li a7, 8
+; CHECK-NEXT:    li a6, 0
+; CHECK-NEXT:    li s2, 8
 ; CHECK-NEXT:    li t0, 12
 ; CHECK-NEXT:    li s0, 4
 ; CHECK-NEXT:    li t1, 20
@@ -55,8 +55,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    mv t4, t1
 ; CHECK-NEXT:    mv t5, t2
 ; CHECK-NEXT:    mv t6, t0
-; CHECK-NEXT:    mv s3, a7
-; CHECK-NEXT:    mv a6, s2
+; CHECK-NEXT:    mv a7, s2
+; CHECK-NEXT:    mv s4, a6
 ; CHECK-NEXT:  .LBB0_2: # %for.cond5.preheader.i
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    # => This Loop Header: Depth=2
@@ -66,8 +66,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    mv s5, t4
 ; CHECK-NEXT:    mv s6, t5
 ; CHECK-NEXT:    mv s7, t6
-; CHECK-NEXT:    mv s8, s3
-; CHECK-NEXT:    mv s4, a6
+; CHECK-NEXT:    mv s3, a7
+; CHECK-NEXT:    mv s9, s4
 ; CHECK-NEXT:  .LBB0_3: # %for.cond9.preheader.i
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    # Parent Loop BB0_2 Depth=2
@@ -77,32 +77,32 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    mv s11, s5
 ; CHECK-NEXT:    mv a3, s6
 ; CHECK-NEXT:    mv ra, s7
-; CHECK-NEXT:    mv a4, s8
-; CHECK-NEXT:    mv s9, s4
+; CHECK-NEXT:    mv s8, s3
+; CHECK-NEXT:    mv s1, s9
 ; CHECK-NEXT:  .LBB0_4: # %vector.ph.i
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    # Parent Loop BB0_2 Depth=2
 ; CHECK-NEXT:    # Parent Loop BB0_3 Depth=3
 ; CHECK-NEXT:    # => This Loop Header: Depth=4
 ; CHECK-NEXT:    # Child Loop BB0_5 Depth 5
-; CHECK-NEXT:    li a5, 0
+; CHECK-NEXT:    li a1, 0
 ; CHECK-NEXT:  .LBB0_5: # %vector.body.i
 ; CHECK-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-NEXT:    # Parent Loop BB0_2 Depth=2
 ; CHECK-NEXT:    # Parent Loop BB0_3 Depth=3
 ; CHECK-NEXT:    # Parent Loop BB0_4 Depth=4
 ; CHECK-NEXT:    # => This Inner Loop Header: Depth=5
-; CHECK-NEXT:    addi s1, a5, 4
-; CHECK-NEXT:    add a1, a4, a5
+; CHECK-NEXT:    addi a5, a1, 4
+; CHECK-NEXT:    add a4, s8, a1
+; CHECK-NEXT:    add a1, a1, a3
+; CHECK-NEXT:    vse32.v v8, (a4), v0.t
 ; CHECK-NEXT:    vse32.v v8, (a1), v0.t
-; CHECK-NEXT:    add a5, a5, a3
-; CHECK-NEXT:    vse32.v v8, (a5), v0.t
-; CHECK-NEXT:    mv a5, s1
-; CHECK-NEXT:    bne s1, s0, .LBB0_5
+; CHECK-NEXT:    mv a1, a5
+; CHECK-NEXT:    bne a5, s0, .LBB0_5
 ; CHECK-NEXT:  # %bb.6: # %for.cond.cleanup15.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_4 Depth=4
-; CHECK-NEXT:    addi s9, s9, 4
-; CHECK-NEXT:    addi a4, a4, 4
+; CHECK-NEXT:    addi s1, s1, 4
+; CHECK-NEXT:    addi s8, s8, 4
 ; CHECK-NEXT:    addi ra, ra, 4
 ; CHECK-NEXT:    addi a3, a3, 4
 ; CHECK-NEXT:    andi s10, a0, 1
@@ -110,8 +110,8 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    beqz s10, .LBB0_4
 ; CHECK-NEXT:  # %bb.7: # %for.cond.cleanup11.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=3
-; CHECK-NEXT:    addi s4, s4, 4
-; CHECK-NEXT:    addi s8, s8, 4
+; CHECK-NEXT:    addi s9, s9, 4
+; CHECK-NEXT:    addi s3, s3, 4
 ; CHECK-NEXT:    addi s7, s7, 4
 ; CHECK-NEXT:    addi s6, s6, 4
 ; CHECK-NEXT:    andi a1, a2, 1
@@ -119,16 +119,16 @@ define i32 @main(i1 %arg.1, i64 %arg.2, i1 %arg.3, i64 %arg.4, i1 %arg.5, <vscal
 ; CHECK-NEXT:    beqz a1, .LBB0_3
 ; CHECK-NEXT:  # %bb.8: # %for.cond.cleanup7.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=2
-; CHECK-NEXT:    addi a6, a6, 4
-; CHECK-NEXT:    addi s3, s3, 4
+; CHECK-NEXT:    addi s4, s4, 4
+; CHECK-NEXT:    addi a7, a7, 4
 ; CHECK-NEXT:    addi t6, t6, 4
 ; CHECK-NEXT:    addi t5, t5, 4
 ; CHECK-NEXT:    addi t4, t4, 4
 ; CHECK-NEXT:    beqz t3, .LBB0_2
 ; CHECK-NEXT:  # %bb.9: # %for.cond.cleanup3.i
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    addi a6, a6, 4
 ; CHECK-NEXT:    addi s2, s2, 4
-; CHECK-NEXT:    addi a7, a7, 4
 ; CHECK-NEXT:    addi t0, t0, 4
 ; CHECK-NEXT:    addi t2, t2, 4
 ; CHECK-NEXT:    addi t1, t1, 4

diff  --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
index 859d87d096962b..2a69dd31118bd8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x bfloat> @vp_rint_nxv1bf16(<vscale x 1 x bfloat> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -42,10 +42,10 @@ define <vscale x 1 x bfloat> @vp_rint_nxv1bf16_unmasked(<vscale x 1 x bfloat> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -65,10 +65,10 @@ define <vscale x 2 x bfloat> @vp_rint_nxv2bf16(<vscale x 2 x bfloat> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -88,10 +88,10 @@ define <vscale x 2 x bfloat> @vp_rint_nxv2bf16_unmasked(<vscale x 2 x bfloat> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -112,10 +112,10 @@ define <vscale x 4 x bfloat> @vp_rint_nxv4bf16(<vscale x 4 x bfloat> %va, <vscal
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -136,10 +136,10 @@ define <vscale x 4 x bfloat> @vp_rint_nxv4bf16_unmasked(<vscale x 4 x bfloat> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -160,10 +160,10 @@ define <vscale x 8 x bfloat> @vp_rint_nxv8bf16(<vscale x 8 x bfloat> %va, <vscal
 ; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -184,10 +184,10 @@ define <vscale x 8 x bfloat> @vp_rint_nxv8bf16_unmasked(<vscale x 8 x bfloat> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -208,10 +208,10 @@ define <vscale x 16 x bfloat> @vp_rint_nxv16bf16(<vscale x 16 x bfloat> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
@@ -232,10 +232,10 @@ define <vscale x 16 x bfloat> @vp_rint_nxv16bf16_unmasked(<vscale x 16 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -259,53 +259,47 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v17, v0, a2
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vmv1r.v v8, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -327,49 +321,39 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 define <vscale x 32 x bfloat> @vp_rint_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v8, fa5, v0.t
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
@@ -379,12 +363,6 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.rint.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -411,10 +389,10 @@ define <vscale x 1 x half> @vp_rint_nxv1f16(<vscale x 1 x half> %va, <vscale x 1
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
@@ -447,10 +425,10 @@ define <vscale x 1 x half> @vp_rint_nxv1f16_unmasked(<vscale x 1 x half> %va, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -485,10 +463,10 @@ define <vscale x 2 x half> @vp_rint_nxv2f16(<vscale x 2 x half> %va, <vscale x 2
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
@@ -521,10 +499,10 @@ define <vscale x 2 x half> @vp_rint_nxv2f16_unmasked(<vscale x 2 x half> %va, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -560,10 +538,10 @@ define <vscale x 4 x half> @vp_rint_nxv4f16(<vscale x 4 x half> %va, <vscale x 4
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
@@ -597,10 +575,10 @@ define <vscale x 4 x half> @vp_rint_nxv4f16_unmasked(<vscale x 4 x half> %va, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -618,9 +596,9 @@ declare <vscale x 8 x half> @llvm.vp.rint.nxv8f16(<vscale x 8 x half>, <vscale x
 define <vscale x 8 x half> @vp_rint_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_rint_nxv8f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -638,10 +616,10 @@ define <vscale x 8 x half> @vp_rint_nxv8f16(<vscale x 8 x half> %va, <vscale x 8
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v10
@@ -675,10 +653,10 @@ define <vscale x 8 x half> @vp_rint_nxv8f16_unmasked(<vscale x 8 x half> %va, i3
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -696,9 +674,9 @@ declare <vscale x 16 x half> @llvm.vp.rint.nxv16f16(<vscale x 16 x half>, <vscal
 define <vscale x 16 x half> @vp_rint_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_rint_nxv16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -716,10 +694,10 @@ define <vscale x 16 x half> @vp_rint_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
@@ -753,10 +731,10 @@ define <vscale x 16 x half> @vp_rint_nxv16f16_unmasked(<vscale x 16 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
@@ -774,9 +752,9 @@ declare <vscale x 32 x half> @llvm.vp.rint.nxv32f16(<vscale x 32 x half>, <vscal
 define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_rint_nxv32f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
@@ -797,53 +775,47 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vmv1r.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
@@ -878,49 +850,39 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vp_rint_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -930,12 +892,6 @@ define <vscale x 32 x half> @vp_rint_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.rint.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1180,9 +1136,9 @@ declare <vscale x 2 x double> @llvm.vp.rint.nxv2f64(<vscale x 2 x double>, <vsca
 define <vscale x 2 x double> @vp_rint_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv2f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -1220,9 +1176,9 @@ declare <vscale x 4 x double> @llvm.vp.rint.nxv4f64(<vscale x 4 x double>, <vsca
 define <vscale x 4 x double> @vp_rint_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -1260,9 +1216,9 @@ declare <vscale x 7 x double> @llvm.vp.rint.nxv7f64(<vscale x 7 x double>, <vsca
 define <vscale x 7 x double> @vp_rint_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv7f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1300,9 +1256,9 @@ declare <vscale x 8 x double> @llvm.vp.rint.nxv8f64(<vscale x 8 x double>, <vsca
 define <vscale x 8 x double> @vp_rint_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1349,12 +1305,12 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI44_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a3)
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
@@ -1402,12 +1358,12 @@ define <vscale x 16 x double> @vp_rint_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-LABEL: vp_rint_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI45_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a3)
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI45_0)
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
index 4c5835afd49e64..3bbdd1a257fdb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
@@ -8,17 +8,17 @@ define float @reduce_fadd(ptr %f) {
 ; CHECK-LABEL: reduce_fadd:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    srli a1, a2, 1
-; CHECK-NEXT:    vsetvli a3, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    srli a1, a2, 1
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    li a3, 1024
 ; CHECK-NEXT:  .LBB0_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v10, (a0)
+; CHECK-NEXT:    sub a3, a3, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfredosum.vs v8, v10, v8
-; CHECK-NEXT:    sub a3, a3, a1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    bnez a3, .LBB0_1
 ; CHECK-NEXT:  # %bb.2: # %exit

diff  --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index 9ed28248e0cc10..8a10e75333ad0a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x bfloat> @vp_round_nxv1bf16(<vscale x 1 x bfloat> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -44,10 +44,10 @@ define <vscale x 1 x bfloat> @vp_round_nxv1bf16_unmasked(<vscale x 1 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -69,10 +69,10 @@ define <vscale x 2 x bfloat> @vp_round_nxv2bf16(<vscale x 2 x bfloat> %va, <vsca
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -94,10 +94,10 @@ define <vscale x 2 x bfloat> @vp_round_nxv2bf16_unmasked(<vscale x 2 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -120,10 +120,10 @@ define <vscale x 4 x bfloat> @vp_round_nxv4bf16(<vscale x 4 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -146,10 +146,10 @@ define <vscale x 4 x bfloat> @vp_round_nxv4bf16_unmasked(<vscale x 4 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -172,10 +172,10 @@ define <vscale x 8 x bfloat> @vp_round_nxv8bf16(<vscale x 8 x bfloat> %va, <vsca
 ; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -198,10 +198,10 @@ define <vscale x 8 x bfloat> @vp_round_nxv8bf16_unmasked(<vscale x 8 x bfloat> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -224,10 +224,10 @@ define <vscale x 16 x bfloat> @vp_round_nxv16bf16(<vscale x 16 x bfloat> %va, <v
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
@@ -250,10 +250,10 @@ define <vscale x 16 x bfloat> @vp_round_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -279,62 +279,54 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v17, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 4
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vmv1r.v v8, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -354,51 +346,41 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 4
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
@@ -410,12 +392,6 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.round.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -444,10 +420,10 @@ define <vscale x 1 x half> @vp_round_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -484,10 +460,10 @@ define <vscale x 1 x half> @vp_round_nxv1f16_unmasked(<vscale x 1 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -526,10 +502,10 @@ define <vscale x 2 x half> @vp_round_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -566,10 +542,10 @@ define <vscale x 2 x half> @vp_round_nxv2f16_unmasked(<vscale x 2 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -609,10 +585,10 @@ define <vscale x 4 x half> @vp_round_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -650,10 +626,10 @@ define <vscale x 4 x half> @vp_round_nxv4f16_unmasked(<vscale x 4 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -673,9 +649,9 @@ declare <vscale x 8 x half> @llvm.vp.round.nxv8f16(<vscale x 8 x half>, <vscale
 define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv8f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -695,10 +671,10 @@ define <vscale x 8 x half> @vp_round_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -736,10 +712,10 @@ define <vscale x 8 x half> @vp_round_nxv8f16_unmasked(<vscale x 8 x half> %va, i
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -759,9 +735,9 @@ declare <vscale x 16 x half> @llvm.vp.round.nxv16f16(<vscale x 16 x half>, <vsca
 define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -781,10 +757,10 @@ define <vscale x 16 x half> @vp_round_nxv16f16(<vscale x 16 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
@@ -822,10 +798,10 @@ define <vscale x 16 x half> @vp_round_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -845,9 +821,9 @@ declare <vscale x 32 x half> @llvm.vp.round.nxv32f16(<vscale x 32 x half>, <vsca
 define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_round_nxv32f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
@@ -870,62 +846,54 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v8, v16
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -960,51 +928,41 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -1016,12 +974,6 @@ define <vscale x 32 x half> @vp_round_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.round.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1290,9 +1242,9 @@ declare <vscale x 2 x double> @llvm.vp.round.nxv2f64(<vscale x 2 x double>, <vsc
 define <vscale x 2 x double> @vp_round_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv2f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -1334,9 +1286,9 @@ declare <vscale x 4 x double> @llvm.vp.round.nxv4f64(<vscale x 4 x double>, <vsc
 define <vscale x 4 x double> @vp_round_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -1378,9 +1330,9 @@ declare <vscale x 7 x double> @llvm.vp.round.nxv7f64(<vscale x 7 x double>, <vsc
 define <vscale x 7 x double> @vp_round_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv7f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1422,9 +1374,9 @@ declare <vscale x 8 x double> @llvm.vp.round.nxv8f64(<vscale x 8 x double>, <vsc
 define <vscale x 8 x double> @vp_round_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1475,12 +1427,12 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI44_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a3)
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
@@ -1501,23 +1453,26 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -1533,12 +1488,12 @@ define <vscale x 16 x double> @vp_round_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-LABEL: vp_round_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI45_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a3)
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI45_0)
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index 3fdb354bff94bf..4cd909e4b0a637 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x bfloat> @vp_roundeven_nxv1bf16(<vscale x 1 x bfloat> %va, <
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -44,10 +44,10 @@ define <vscale x 1 x bfloat> @vp_roundeven_nxv1bf16_unmasked(<vscale x 1 x bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -69,10 +69,10 @@ define <vscale x 2 x bfloat> @vp_roundeven_nxv2bf16(<vscale x 2 x bfloat> %va, <
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -94,10 +94,10 @@ define <vscale x 2 x bfloat> @vp_roundeven_nxv2bf16_unmasked(<vscale x 2 x bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -120,10 +120,10 @@ define <vscale x 4 x bfloat> @vp_roundeven_nxv4bf16(<vscale x 4 x bfloat> %va, <
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -146,10 +146,10 @@ define <vscale x 4 x bfloat> @vp_roundeven_nxv4bf16_unmasked(<vscale x 4 x bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -172,10 +172,10 @@ define <vscale x 8 x bfloat> @vp_roundeven_nxv8bf16(<vscale x 8 x bfloat> %va, <
 ; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -198,10 +198,10 @@ define <vscale x 8 x bfloat> @vp_roundeven_nxv8bf16_unmasked(<vscale x 8 x bfloa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -224,10 +224,10 @@ define <vscale x 16 x bfloat> @vp_roundeven_nxv16bf16(<vscale x 16 x bfloat> %va
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
@@ -250,10 +250,10 @@ define <vscale x 16 x bfloat> @vp_roundeven_nxv16bf16_unmasked(<vscale x 16 x bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -279,62 +279,54 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v17, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 0
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vmv1r.v v8, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -354,51 +346,41 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
 define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 0
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
@@ -410,12 +392,6 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16_unmasked(<vscale x 32 x bf
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.roundeven.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -444,10 +420,10 @@ define <vscale x 1 x half> @vp_roundeven_nxv1f16(<vscale x 1 x half> %va, <vscal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -484,10 +460,10 @@ define <vscale x 1 x half> @vp_roundeven_nxv1f16_unmasked(<vscale x 1 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -526,10 +502,10 @@ define <vscale x 2 x half> @vp_roundeven_nxv2f16(<vscale x 2 x half> %va, <vscal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -566,10 +542,10 @@ define <vscale x 2 x half> @vp_roundeven_nxv2f16_unmasked(<vscale x 2 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -609,10 +585,10 @@ define <vscale x 4 x half> @vp_roundeven_nxv4f16(<vscale x 4 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -650,10 +626,10 @@ define <vscale x 4 x half> @vp_roundeven_nxv4f16_unmasked(<vscale x 4 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -673,9 +649,9 @@ declare <vscale x 8 x half> @llvm.vp.roundeven.nxv8f16(<vscale x 8 x half>, <vsc
 define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv8f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -695,10 +671,10 @@ define <vscale x 8 x half> @vp_roundeven_nxv8f16(<vscale x 8 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -736,10 +712,10 @@ define <vscale x 8 x half> @vp_roundeven_nxv8f16_unmasked(<vscale x 8 x half> %v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -759,9 +735,9 @@ declare <vscale x 16 x half> @llvm.vp.roundeven.nxv16f16(<vscale x 16 x half>, <
 define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -781,10 +757,10 @@ define <vscale x 16 x half> @vp_roundeven_nxv16f16(<vscale x 16 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
@@ -822,10 +798,10 @@ define <vscale x 16 x half> @vp_roundeven_nxv16f16_unmasked(<vscale x 16 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -845,9 +821,9 @@ declare <vscale x 32 x half> @llvm.vp.roundeven.nxv32f16(<vscale x 32 x half>, <
 define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundeven_nxv32f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
@@ -870,62 +846,54 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v8, v16
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -960,51 +928,41 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -1016,12 +974,6 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16_unmasked(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.roundeven.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1290,9 +1242,9 @@ declare <vscale x 2 x double> @llvm.vp.roundeven.nxv2f64(<vscale x 2 x double>,
 define <vscale x 2 x double> @vp_roundeven_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv2f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -1334,9 +1286,9 @@ declare <vscale x 4 x double> @llvm.vp.roundeven.nxv4f64(<vscale x 4 x double>,
 define <vscale x 4 x double> @vp_roundeven_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -1378,9 +1330,9 @@ declare <vscale x 7 x double> @llvm.vp.roundeven.nxv7f64(<vscale x 7 x double>,
 define <vscale x 7 x double> @vp_roundeven_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv7f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1422,9 +1374,9 @@ declare <vscale x 8 x double> @llvm.vp.roundeven.nxv8f64(<vscale x 8 x double>,
 define <vscale x 8 x double> @vp_roundeven_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1475,12 +1427,12 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI44_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a3)
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
@@ -1501,23 +1453,26 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -1533,12 +1488,12 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64_unmasked(<vscale x 16 x dou
 ; CHECK-LABEL: vp_roundeven_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI45_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a3)
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI45_0)
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index d999f65af4e346..96c821a76ae84e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -19,10 +19,10 @@ define <vscale x 1 x bfloat> @vp_roundtozero_nxv1bf16(<vscale x 1 x bfloat> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
@@ -44,10 +44,10 @@ define <vscale x 1 x bfloat> @vp_roundtozero_nxv1bf16_unmasked(<vscale x 1 x bfl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -69,10 +69,10 @@ define <vscale x 2 x bfloat> @vp_roundtozero_nxv2bf16(<vscale x 2 x bfloat> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
@@ -94,10 +94,10 @@ define <vscale x 2 x bfloat> @vp_roundtozero_nxv2bf16_unmasked(<vscale x 2 x bfl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v9
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -120,10 +120,10 @@ define <vscale x 4 x bfloat> @vp_roundtozero_nxv4bf16(<vscale x 4 x bfloat> %va,
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v10, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
@@ -146,10 +146,10 @@ define <vscale x 4 x bfloat> @vp_roundtozero_nxv4bf16_unmasked(<vscale x 4 x bfl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v10
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -172,10 +172,10 @@ define <vscale x 8 x bfloat> @vp_roundtozero_nxv8bf16(<vscale x 8 x bfloat> %va,
 ; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v12, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; CHECK-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
@@ -198,10 +198,10 @@ define <vscale x 8 x bfloat> @vp_roundtozero_nxv8bf16_unmasked(<vscale x 8 x bfl
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v12
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -224,10 +224,10 @@ define <vscale x 16 x bfloat> @vp_roundtozero_nxv16bf16(<vscale x 16 x bfloat> %
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
@@ -250,10 +250,10 @@ define <vscale x 16 x bfloat> @vp_roundtozero_nxv16bf16_unmasked(<vscale x 16 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    lui a1, 307200
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v8, v16
-; CHECK-NEXT:    lui a0, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a0
+; CHECK-NEXT:    fmv.w.x fa5, a1
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa5
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -279,62 +279,54 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmv1r.v v16, v0
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v17, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v17
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v0, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 1
-; CHECK-NEXT:    vmv1r.v v0, v17
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    vmv1r.v v8, v16
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -354,51 +346,41 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
 define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x bfloat> %va, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    lui a3, 307200
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    fmv.w.x fa5, a3
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vmv1r.v v0, v16
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v24, v0.t
-; CHECK-NEXT:    lui a2, 307200
-; CHECK-NEXT:    fmv.w.x fa5, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a2, 1
-; CHECK-NEXT:    vmv1r.v v0, v16
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB11_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5
@@ -410,12 +392,6 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16_unmasked(<vscale x 32 x
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.roundtozero.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -444,10 +420,10 @@ define <vscale x 1 x half> @vp_roundtozero_nxv1f16(<vscale x 1 x half> %va, <vsc
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -484,10 +460,10 @@ define <vscale x 1 x half> @vp_roundtozero_nxv1f16_unmasked(<vscale x 1 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -526,10 +502,10 @@ define <vscale x 2 x half> @vp_roundtozero_nxv2f16(<vscale x 2 x half> %va, <vsc
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -566,10 +542,10 @@ define <vscale x 2 x half> @vp_roundtozero_nxv2f16_unmasked(<vscale x 2 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v9
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v9, v0.t
@@ -609,10 +585,10 @@ define <vscale x 4 x half> @vp_roundtozero_nxv4f16(<vscale x 4 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    vmv1r.v v9, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v12, v10, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v9, v12, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -650,10 +626,10 @@ define <vscale x 4 x half> @vp_roundtozero_nxv4f16_unmasked(<vscale x 4 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v10
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v10, v0.t
@@ -673,9 +649,9 @@ declare <vscale x 8 x half> @llvm.vp.roundtozero.nxv8f16(<vscale x 8 x half>, <v
 define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv8f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI18_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI18_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v10, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFH-NEXT:    vfabs.v v12, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
@@ -695,10 +671,10 @@ define <vscale x 8 x half> @vp_roundtozero_nxv8f16(<vscale x 8 x half> %va, <vsc
 ; ZVFHMIN-NEXT:    vmv1r.v v10, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v12, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v10, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -736,10 +712,10 @@ define <vscale x 8 x half> @vp_roundtozero_nxv8f16_unmasked(<vscale x 8 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v12
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v12, v0.t
@@ -759,9 +735,9 @@ declare <vscale x 16 x half> @llvm.vp.roundtozero.nxv16f16(<vscale x 16 x half>,
 define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv16f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI20_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI20_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v12, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfabs.v v16, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, mu
@@ -781,10 +757,10 @@ define <vscale x 16 x half> @vp_roundtozero_nxv16f16(<vscale x 16 x half> %va, <
 ; ZVFHMIN-NEXT:    vmv1r.v v12, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
@@ -822,10 +798,10 @@ define <vscale x 16 x half> @vp_roundtozero_nxv16f16_unmasked(<vscale x 16 x hal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    lui a1, 307200
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v8, v16
-; ZVFHMIN-NEXT:    lui a0, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a1
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v8, fa5
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v16, v0.t
@@ -845,9 +821,9 @@ declare <vscale x 32 x half> @llvm.vp.roundtozero.nxv32f16(<vscale x 32 x half>,
 define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vp_roundtozero_nxv32f16:
 ; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    lui a1, %hi(.LCPI22_0)
 ; ZVFH-NEXT:    flh fa5, %lo(.LCPI22_0)(a1)
-; ZVFH-NEXT:    vmv1r.v v16, v0
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFH-NEXT:    vfabs.v v24, v8, v0.t
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m8, ta, mu
@@ -870,62 +846,54 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmv1r.v v16, v0
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v17, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v0, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v17, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v17
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    vmv1r.v v8, v16
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v8, v16, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a0, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -960,51 +928,41 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    lui a3, 307200
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    fmv.w.x fa5, a3
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfabs.v v8, v24, v0.t
-; ZVFHMIN-NEXT:    lui a2, 307200
-; ZVFHMIN-NEXT:    fmv.w.x fa5, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfabs.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vmflt.vf v16, v8, fa5, v0.t
+; ZVFHMIN-NEXT:    vmflt.vf v12, v24, fa5, v0.t
 ; ZVFHMIN-NEXT:    fsrmi a2, 1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v16
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfcvt.x.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; ZVFHMIN-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
-; ZVFHMIN-NEXT:    vfsgnj.vv v24, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v24, v16
 ; ZVFHMIN-NEXT:    vmflt.vf v0, v24, fa5
@@ -1016,12 +974,6 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16_unmasked(<vscale x 32 x hal
 ; ZVFHMIN-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.roundtozero.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1290,9 +1242,9 @@ declare <vscale x 2 x double> @llvm.vp.roundtozero.nxv2f64(<vscale x 2 x double>
 define <vscale x 2 x double> @vp_roundtozero_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv2f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI36_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI36_0)(a1)
-; CHECK-NEXT:    vmv1r.v v10, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
 ; CHECK-NEXT:    vfabs.v v12, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
@@ -1334,9 +1286,9 @@ declare <vscale x 4 x double> @llvm.vp.roundtozero.nxv4f64(<vscale x 4 x double>
 define <vscale x 4 x double> @vp_roundtozero_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv4f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI38_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI38_0)(a1)
-; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
@@ -1378,9 +1330,9 @@ declare <vscale x 7 x double> @llvm.vp.roundtozero.nxv7f64(<vscale x 7 x double>
 define <vscale x 7 x double> @vp_roundtozero_nxv7f64(<vscale x 7 x double> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv7f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI40_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI40_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1422,9 +1374,9 @@ declare <vscale x 8 x double> @llvm.vp.roundtozero.nxv8f64(<vscale x 8 x double>
 define <vscale x 8 x double> @vp_roundtozero_nxv8f64(<vscale x 8 x double> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv8f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    lui a1, %hi(.LCPI42_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI42_0)(a1)
-; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -1475,12 +1427,12 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v6, v0, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a2)
 ; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI44_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI44_0)(a3)
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v6, v0, a3
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
@@ -1501,23 +1453,26 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v24, v8, v0.t
+; CHECK-NEXT:    vfabs.v v16, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
@@ -1533,12 +1488,12 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64_unmasked(<vscale x 16 x d
 ; CHECK-LABEL: vp_roundtozero_nxv16f64_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    lui a3, %hi(.LCPI45_0)
-; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a3)
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    lui a2, %hi(.LCPI45_0)
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    fld fa5, %lo(.LCPI45_0)(a2)
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16
 ; CHECK-NEXT:    vmflt.vf v0, v24, fa5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
index 3988ae22598c79..b27ba14e85c839 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-args-by-mem.ll
@@ -39,11 +39,11 @@ define <vscale x 16 x i32> @foo(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5,
 ; CHECK-NEXT:    mv s1, sp
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    addi t0, s1, 64
-; CHECK-NEXT:    vs8r.v v8, (t0)
 ; CHECK-NEXT:    csrr t1, vlenb
 ; CHECK-NEXT:    slli t1, t1, 3
 ; CHECK-NEXT:    add t1, s1, t1
 ; CHECK-NEXT:    addi t1, t1, 64
+; CHECK-NEXT:    vs8r.v v8, (t0)
 ; CHECK-NEXT:    vs8r.v v8, (t1)
 ; CHECK-NEXT:    sd t1, 0(sp)
 ; CHECK-NEXT:    sd t0, 8(sp)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll
index ab644599448856..b9432bc568d9ca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-framelayout.ll
@@ -24,8 +24,8 @@ define void @rvv_vla(i64 %n, i64 %i) nounwind {
 ; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    sub a2, s0, a2
 ; CHECK-NEXT:    addi a2, a2, -32
-; CHECK-NEXT:    vl2re64.v v8, (a2)
 ; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    vl2re64.v v8, (a2)
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    lw zero, 0(a0)
 ; CHECK-NEXT:    addi sp, s0, -32
@@ -108,9 +108,9 @@ define void @rvv_vla_and_overaligned(i64 %n, i64 %i) nounwind {
 ; CHECK-NEXT:    addi a2, a2, 112
 ; CHECK-NEXT:    vl1re64.v v8, (a2)
 ; CHECK-NEXT:    addi a2, s1, 112
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    vl2re64.v v8, (a2)
 ; CHECK-NEXT:    lw zero, 64(s1)
-; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    lw zero, 0(a0)
 ; CHECK-NEXT:    addi sp, s0, -144

diff  --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 24b86b28e9a2c2..6c11e9413525e0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -269,13 +269,13 @@ define <vscale x 1 x i1> @fcmp_one_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmflt.vv v9, v10, v8, v0.t
-; CHECK-NEXT:    vmflt.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vmor.mm v0, v8, v9
+; CHECK-NEXT:    vmflt.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vmflt.vv v9, v10, v9, v0.t
+; CHECK-NEXT:    vmor.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
@@ -288,13 +288,13 @@ define <vscale x 1 x i1> @fcmp_one_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmflt.vv v9, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vmor.mm v0, v8, v9
+; CHECK-NEXT:    vmflt.vv v8, v10, v9, v0.t
+; CHECK-NEXT:    vmflt.vv v9, v9, v10, v0.t
+; CHECK-NEXT:    vmor.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
@@ -307,13 +307,11 @@ define <vscale x 1 x i1> @fcmp_ord_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v8, v10, v10, v0.t
-; CHECK-NEXT:    vmand.mm v0, v8, v9
+; CHECK-NEXT:    vmfeq.vv v9, v9, v9, v0.t
+; CHECK-NEXT:    vmand.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"ord", <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i1> %v
@@ -324,15 +322,15 @@ define <vscale x 1 x i1> @fcmp_ord_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; CHECK-NEXT:    vmfeq.vv v9, v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; CHECK-NEXT:    vmand.mm v0, v8, v9
+; CHECK-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; CHECK-NEXT:    vmand.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
@@ -345,15 +343,15 @@ define <vscale x 1 x i1> @fcmp_ord_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; CHECK-NEXT:    vmfeq.vv v9, v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; CHECK-NEXT:    vmand.mm v0, v9, v8
+; CHECK-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; CHECK-NEXT:    vmand.mm v0, v8, v9
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
@@ -381,13 +379,13 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmflt.vv v9, v10, v8, v0.t
-; CHECK-NEXT:    vmflt.vv v8, v8, v10, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v8, v9
+; CHECK-NEXT:    vmflt.vv v8, v9, v10, v0.t
+; CHECK-NEXT:    vmflt.vv v9, v10, v9, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
@@ -400,13 +398,13 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmflt.vv v9, v8, v10, v0.t
-; CHECK-NEXT:    vmflt.vv v8, v10, v8, v0.t
-; CHECK-NEXT:    vmnor.mm v0, v8, v9
+; CHECK-NEXT:    vmflt.vv v8, v10, v9, v0.t
+; CHECK-NEXT:    vmflt.vv v9, v9, v10, v0.t
+; CHECK-NEXT:    vmnor.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
@@ -666,13 +664,11 @@ define <vscale x 1 x i1> @fcmp_uno_vv_nxv1bf16(<vscale x 1 x bfloat> %va, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
-; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; CHECK-NEXT:    vmfne.vv v8, v10, v10, v0.t
-; CHECK-NEXT:    vmor.mm v0, v8, v9
+; CHECK-NEXT:    vmfne.vv v9, v9, v9, v0.t
+; CHECK-NEXT:    vmor.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1bf16(<vscale x 1 x bfloat> %va, <vscale x 1 x bfloat> %vb, metadata !"uno", <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i1> %v
@@ -683,15 +679,15 @@ define <vscale x 1 x i1> @fcmp_uno_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; CHECK-NEXT:    vmfne.vv v9, v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; CHECK-NEXT:    vmor.mm v0, v8, v9
+; CHECK-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; CHECK-NEXT:    vmor.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
@@ -704,15 +700,15 @@ define <vscale x 1 x i1> @fcmp_uno_vf_swap_nxv1bf16(<vscale x 1 x bfloat> %va, b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; CHECK-NEXT:    vmfne.vv v9, v9, v9, v0.t
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; CHECK-NEXT:    vmor.mm v0, v9, v8
+; CHECK-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; CHECK-NEXT:    vmor.mm v0, v8, v9
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
@@ -1008,9 +1004,9 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    vmflt.vv v9, v16, v12, v0.t
@@ -1027,9 +1023,9 @@ define <vscale x 8 x i1> @fcmp_one_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; CHECK-NEXT:    vmflt.vv v9, v12, v16, v0.t
@@ -1063,15 +1059,15 @@ define <vscale x 8 x i1> @fcmp_ord_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12, v0.t
+; CHECK-NEXT:    vmfeq.vv v10, v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v9, v12, v12, v0.t
-; CHECK-NEXT:    vmand.mm v0, v8, v9
+; CHECK-NEXT:    vmfeq.vv v8, v12, v12, v0.t
+; CHECK-NEXT:    vmand.mm v0, v10, v8
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -1084,15 +1080,15 @@ define <vscale x 8 x i1> @fcmp_ord_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12, v0.t
+; CHECK-NEXT:    vmfeq.vv v10, v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v9, v12, v12, v0.t
-; CHECK-NEXT:    vmand.mm v0, v9, v8
+; CHECK-NEXT:    vmfeq.vv v8, v12, v12, v0.t
+; CHECK-NEXT:    vmand.mm v0, v8, v10
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -1120,9 +1116,9 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; CHECK-NEXT:    vmflt.vv v9, v16, v12, v0.t
@@ -1139,9 +1135,9 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; CHECK-NEXT:    vmflt.vv v9, v12, v16, v0.t
@@ -1425,15 +1421,15 @@ define <vscale x 8 x i1> @fcmp_uno_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v8, v12, v12, v0.t
+; CHECK-NEXT:    vmfne.vv v10, v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v9, v12, v12, v0.t
-; CHECK-NEXT:    vmor.mm v0, v8, v9
+; CHECK-NEXT:    vmfne.vv v8, v12, v12, v0.t
+; CHECK-NEXT:    vmor.mm v0, v10, v8
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -1446,15 +1442,15 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8bf16(<vscale x 8 x bfloat> %va, b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v8, v12, v12, v0.t
+; CHECK-NEXT:    vmfne.vv v10, v12, v12, v0.t
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v9, v12, v12, v0.t
-; CHECK-NEXT:    vmor.mm v0, v9, v8
+; CHECK-NEXT:    vmfne.vv v8, v12, v12, v0.t
+; CHECK-NEXT:    vmor.mm v0, v8, v10
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -1470,188 +1466,177 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    slli a1, a1, 2
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb
-; CHECK-NEXT:    vmv8r.v v24, v16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a1, a3, 3
-; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vl8re16.v v16, (a1)
 ; CHECK-NEXT:    slli a5, a3, 2
-; CHECK-NEXT:    sub a1, a2, a5
-; CHECK-NEXT:    sltu a4, a2, a1
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a6, a4, a1
 ; CHECK-NEXT:    slli a4, a3, 1
+; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    sub a6, a2, a5
+; CHECK-NEXT:    vl8re16.v v0, (a1)
+; CHECK-NEXT:    sltu a1, a2, a6
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a6, a1, a6
 ; CHECK-NEXT:    sub a1, a6, a4
 ; CHECK-NEXT:    sltu a7, a6, a1
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    and a7, a7, a1
 ; CHECK-NEXT:    srli a1, a3, 1
+; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    csrr t0, vlenb
+; CHECK-NEXT:    slli t0, t0, 3
 ; CHECK-NEXT:    add t0, sp, t0
 ; CHECK-NEXT:    addi t0, t0, 16
-; CHECK-NEXT:    vs1r.v v0, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs1r.v v24, (t0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v0, a1
-; CHECK-NEXT:    srli a3, a3, 2
-; CHECK-NEXT:    addi t0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vslidedown.vx v25, v24, a1
 ; CHECK-NEXT:    vsetvli t0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a3
+; CHECK-NEXT:    vslidedown.vx v24, v25, a3
 ; CHECK-NEXT:    vl8re16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv t0, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add t0, t0, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, t0
+; CHECK-NEXT:    slli t0, a0, 5
+; CHECK-NEXT:    add a0, t0, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    slli t0, a0, 3
+; CHECK-NEXT:    add a0, t0, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
-; CHECK-NEXT:    vmv4r.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv t0, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, t0
+; CHECK-NEXT:    slli t0, a0, 4
+; CHECK-NEXT:    add a0, t0, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a7, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v26, v16, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v6, v16, v8, v0.t
 ; CHECK-NEXT:    bltu a6, a4, .LBB85_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a6, a4
 ; CHECK-NEXT:  .LBB85_2:
+; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    slli a7, a0, 3
+; CHECK-NEXT:    add a0, a7, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    mv a7, a0
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add a0, a0, a7
+; CHECK-NEXT:    slli a7, a0, 4
+; CHECK-NEXT:    add a0, a7, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
 ; CHECK-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v6, v16, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v5, v24, v8, v0.t
 ; CHECK-NEXT:    add a0, a3, a3
 ; CHECK-NEXT:    bltu a2, a5, .LBB85_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a5
 ; CHECK-NEXT:  .LBB85_4:
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v6, v26, a3
 ; CHECK-NEXT:    sub a5, a2, a4
-; CHECK-NEXT:    sltu a6, a2, a5
-; CHECK-NEXT:    addi a6, a6, -1
-; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    csrr a6, vlenb
+; CHECK-NEXT:    slli a6, a6, 3
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl1r.v v8, (a6) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv1r.v v7, v8
+; CHECK-NEXT:    vl1r.v v7, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a3
+; CHECK-NEXT:    vslidedown.vx v0, v7, a3
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 1
 ; CHECK-NEXT:    mv a7, a6
 ; CHECK-NEXT:    slli a6, a6, 3
+; CHECK-NEXT:    add a7, a7, a6
+; CHECK-NEXT:    slli a6, a6, 1
 ; CHECK-NEXT:    add a6, a6, a7
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a6, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    mv a7, a6
-; CHECK-NEXT:    slli a6, a6, 2
-; CHECK-NEXT:    add a6, a6, a7
+; CHECK-NEXT:    slli a7, a6, 4
+; CHECK-NEXT:    add a6, a7, a6
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
 ; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    mv a7, a6
-; CHECK-NEXT:    slli a6, a6, 2
-; CHECK-NEXT:    add a7, a7, a6
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    add a6, a6, a7
+; CHECK-NEXT:    slli a7, a6, 5
+; CHECK-NEXT:    add a6, a7, a6
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT:    sltu a6, a2, a5
+; CHECK-NEXT:    addi a6, a6, -1
+; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 1
-; CHECK-NEXT:    mv a7, a6
-; CHECK-NEXT:    slli a6, a6, 2
-; CHECK-NEXT:    add a6, a6, a7
+; CHECK-NEXT:    slli a7, a6, 4
+; CHECK-NEXT:    add a6, a7, a6
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v5, v16, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v4, v24, v8, v0.t
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v5, v6, a3
 ; CHECK-NEXT:    bltu a2, a4, .LBB85_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a4
 ; CHECK-NEXT:  .LBB85_6:
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    mv a5, a4
-; CHECK-NEXT:    slli a4, a4, 2
-; CHECK-NEXT:    add a5, a5, a4
-; CHECK-NEXT:    slli a4, a4, 1
-; CHECK-NEXT:    add a4, a4, a5
+; CHECK-NEXT:    slli a5, a4, 5
+; CHECK-NEXT:    add a4, a5, a4
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v5, a3
+; CHECK-NEXT:    vslideup.vx v8, v4, a3
 ; CHECK-NEXT:    add a0, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v6, a1
+; CHECK-NEXT:    vslideup.vx v8, v5, a1
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -2025,13 +2010,13 @@ define <vscale x 1 x i1> @fcmp_one_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v8, v0.t
-; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
+; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -2052,13 +2037,13 @@ define <vscale x 1 x i1> @fcmp_one_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v9, v8, v10, v0.t
-; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v8, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v9, v0.t
+; ZVFHMIN-NEXT:    vmflt.vv v9, v9, v10, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -2079,13 +2064,11 @@ define <vscale x 1 x i1> @fcmp_ord_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"ord", <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i1> %v
@@ -2106,15 +2089,15 @@ define <vscale x 1 x i1> @fcmp_ord_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -2137,15 +2120,15 @@ define <vscale x 1 x i1> @fcmp_ord_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -2189,13 +2172,13 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v8, v0.t
-; ZVFHMIN-NEXT:    vmflt.vv v8, v8, v10, v0.t
-; ZVFHMIN-NEXT:    vmnor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmflt.vv v8, v9, v10, v0.t
+; ZVFHMIN-NEXT:    vmflt.vv v9, v10, v9, v0.t
+; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -2216,13 +2199,13 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmflt.vv v9, v8, v10, v0.t
-; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v8, v0.t
-; ZVFHMIN-NEXT:    vmnor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmflt.vv v8, v10, v9, v0.t
+; ZVFHMIN-NEXT:    vmflt.vv v9, v9, v10, v0.t
+; ZVFHMIN-NEXT:    vmnor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -2592,13 +2575,11 @@ define <vscale x 1 x i1> @fcmp_uno_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 1 x i1> @llvm.vp.fcmp.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, metadata !"uno", <vscale x 1 x i1> %m, i32 %evl)
   ret <vscale x 1 x i1> %v
@@ -2619,15 +2600,15 @@ define <vscale x 1 x i1> @fcmp_uno_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -2650,15 +2631,15 @@ define <vscale x 1 x i1> @fcmp_uno_vf_swap_nxv1f16(<vscale x 1 x half> %va, half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v9, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v9, v9, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v10, v10, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfne.vv v8, v10, v10, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -3081,9 +3062,9 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12, v0.t
@@ -3108,9 +3089,9 @@ define <vscale x 8 x i1> @fcmp_one_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
@@ -3162,15 +3143,15 @@ define <vscale x 8 x i1> @fcmp_ord_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -3193,15 +3174,15 @@ define <vscale x 8 x i1> @fcmp_ord_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmand.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -3245,9 +3226,9 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12, v0.t
@@ -3272,9 +3253,9 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16, v0.t
@@ -3681,15 +3662,15 @@ define <vscale x 8 x i1> @fcmp_uno_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -3712,15 +3693,15 @@ define <vscale x 8 x i1> @fcmp_uno_vf_swap_nxv8f16(<vscale x 8 x half> %va, half
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v12, v12, v0.t
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12, v0.t
+; ZVFHMIN-NEXT:    vmor.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -3748,18 +3729,18 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFH-NEXT:    csrr a3, vlenb
 ; ZVFH-NEXT:    srli a1, a3, 1
 ; ZVFH-NEXT:    slli a4, a3, 3
+; ZVFH-NEXT:    slli a3, a3, 2
 ; ZVFH-NEXT:    add a4, a0, a4
+; ZVFH-NEXT:    sub a5, a2, a3
 ; ZVFH-NEXT:    vl8re16.v v8, (a4)
-; ZVFH-NEXT:    slli a3, a3, 2
-; ZVFH-NEXT:    sub a4, a2, a3
-; ZVFH-NEXT:    sltu a5, a2, a4
-; ZVFH-NEXT:    addi a5, a5, -1
+; ZVFH-NEXT:    sltu a4, a2, a5
+; ZVFH-NEXT:    addi a4, a4, -1
 ; ZVFH-NEXT:    vl8re16.v v0, (a0)
 ; ZVFH-NEXT:    addi a0, sp, 16
 ; ZVFH-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; ZVFH-NEXT:    vslidedown.vx v0, v24, a1
-; ZVFH-NEXT:    and a4, a5, a4
+; ZVFH-NEXT:    and a4, a4, a5
 ; ZVFH-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
 ; ZVFH-NEXT:    vmfeq.vv v7, v16, v8, v0.t
 ; ZVFH-NEXT:    bltu a2, a3, .LBB171_2
@@ -3793,188 +3774,177 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a3, a3, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a3, a3, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a3, 3
-; ZVFHMIN-NEXT:    add a1, a0, a1
-; ZVFHMIN-NEXT:    vl8re16.v v16, (a1)
 ; ZVFHMIN-NEXT:    slli a5, a3, 2
-; ZVFHMIN-NEXT:    sub a1, a2, a5
-; ZVFHMIN-NEXT:    sltu a4, a2, a1
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a6, a4, a1
 ; ZVFHMIN-NEXT:    slli a4, a3, 1
+; ZVFHMIN-NEXT:    add a1, a0, a1
+; ZVFHMIN-NEXT:    sub a6, a2, a5
+; ZVFHMIN-NEXT:    vl8re16.v v0, (a1)
+; ZVFHMIN-NEXT:    sltu a1, a2, a6
+; ZVFHMIN-NEXT:    addi a1, a1, -1
+; ZVFHMIN-NEXT:    and a6, a1, a6
 ; ZVFHMIN-NEXT:    sub a1, a6, a4
 ; ZVFHMIN-NEXT:    sltu a7, a6, a1
 ; ZVFHMIN-NEXT:    addi a7, a7, -1
 ; ZVFHMIN-NEXT:    and a7, a7, a1
 ; ZVFHMIN-NEXT:    srli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    csrr t0, vlenb
+; ZVFHMIN-NEXT:    slli t0, t0, 3
 ; ZVFHMIN-NEXT:    add t0, sp, t0
 ; ZVFHMIN-NEXT:    addi t0, t0, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (t0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs1r.v v24, (t0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v8, v0, a1
-; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    addi t0, sp, 16
-; ZVFHMIN-NEXT:    vs1r.v v8, (t0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vslidedown.vx v25, v24, a1
 ; ZVFHMIN-NEXT:    vsetvli t0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v24, v25, a3
 ; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv t0, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add t0, t0, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, t0
+; ZVFHMIN-NEXT:    slli t0, a0, 5
+; ZVFHMIN-NEXT:    add a0, t0, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    slli t0, a0, 3
+; ZVFHMIN-NEXT:    add a0, t0, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vmv4r.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv t0, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, t0
+; ZVFHMIN-NEXT:    slli t0, a0, 4
+; ZVFHMIN-NEXT:    add a0, t0, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vmv1r.v v0, v24
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a7, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v26, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    bltu a6, a4, .LBB171_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a6, a4
 ; ZVFHMIN-NEXT:  .LBB171_2:
+; ZVFHMIN-NEXT:    vmv1r.v v0, v25
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    slli a7, a0, 3
+; ZVFHMIN-NEXT:    add a0, a7, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    mv a7, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a7
+; ZVFHMIN-NEXT:    slli a7, a0, 4
+; ZVFHMIN-NEXT:    add a0, a7, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v5, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    add a0, a3, a3
 ; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
 ; ZVFHMIN-NEXT:  # %bb.3:
 ; ZVFHMIN-NEXT:    mv a2, a5
 ; ZVFHMIN-NEXT:  .LBB171_4:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v6, v26, a3
 ; ZVFHMIN-NEXT:    sub a5, a2, a4
-; ZVFHMIN-NEXT:    sltu a6, a2, a5
-; ZVFHMIN-NEXT:    addi a6, a6, -1
-; ZVFHMIN-NEXT:    and a5, a6, a5
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
+; ZVFHMIN-NEXT:    slli a6, a6, 3
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl1r.v v8, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vmv1r.v v7, v8
+; ZVFHMIN-NEXT:    vl1r.v v7, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v8, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a6, a6, 1
 ; ZVFHMIN-NEXT:    mv a7, a6
 ; ZVFHMIN-NEXT:    slli a6, a6, 3
+; ZVFHMIN-NEXT:    add a7, a7, a6
+; ZVFHMIN-NEXT:    slli a6, a6, 1
 ; ZVFHMIN-NEXT:    add a6, a6, a7
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a6, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a6, a6, 1
-; ZVFHMIN-NEXT:    mv a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 2
-; ZVFHMIN-NEXT:    add a6, a6, a7
+; ZVFHMIN-NEXT:    slli a7, a6, 4
+; ZVFHMIN-NEXT:    add a6, a7, a6
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a6, a6, 1
-; ZVFHMIN-NEXT:    mv a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 2
-; ZVFHMIN-NEXT:    add a7, a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 1
-; ZVFHMIN-NEXT:    add a6, a6, a7
+; ZVFHMIN-NEXT:    slli a7, a6, 5
+; ZVFHMIN-NEXT:    add a6, a7, a6
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    sltu a6, a2, a5
+; ZVFHMIN-NEXT:    addi a6, a6, -1
+; ZVFHMIN-NEXT:    and a5, a6, a5
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a6, a6, 1
-; ZVFHMIN-NEXT:    mv a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 2
-; ZVFHMIN-NEXT:    add a6, a6, a7
+; ZVFHMIN-NEXT:    slli a7, a6, 4
+; ZVFHMIN-NEXT:    add a6, a7, a6
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v5, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v4, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslideup.vx v5, v6, a3
 ; ZVFHMIN-NEXT:    bltu a2, a4, .LBB171_6
 ; ZVFHMIN-NEXT:  # %bb.5:
 ; ZVFHMIN-NEXT:    mv a2, a4
 ; ZVFHMIN-NEXT:  .LBB171_6:
+; ZVFHMIN-NEXT:    vmv1r.v v0, v7
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 2
-; ZVFHMIN-NEXT:    add a5, a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    slli a5, a4, 5
+; ZVFHMIN-NEXT:    add a4, a5, a4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v5, a3
+; ZVFHMIN-NEXT:    vslideup.vx v8, v4, a3
 ; ZVFHMIN-NEXT:    add a0, a1, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v6, a1
+; ZVFHMIN-NEXT:    vslideup.vx v8, v5, a1
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a1, a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
index 23d73481aed2d7..7ef46a7372e22b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
@@ -420,9 +420,9 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmflt.vv v8, v12, v16
 ; CHECK-NEXT:    vmflt.vv v9, v16, v12
@@ -439,9 +439,9 @@ define <vscale x 8 x i1> @fcmp_one_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmflt.vv v8, v16, v12
 ; CHECK-NEXT:    vmflt.vv v9, v12, v16
@@ -505,15 +505,15 @@ define <vscale x 8 x i1> @fcmp_ord_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
+; CHECK-NEXT:    vmfeq.vv v10, v12, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v9, v12, v12
-; CHECK-NEXT:    vmand.mm v0, v8, v9
+; CHECK-NEXT:    vmfeq.vv v8, v12, v12
+; CHECK-NEXT:    vmand.mm v0, v10, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -526,15 +526,15 @@ define <vscale x 8 x i1> @fcmp_ord_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
+; CHECK-NEXT:    vmfeq.vv v10, v12, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v9, v12, v12
-; CHECK-NEXT:    vmand.mm v0, v9, v8
+; CHECK-NEXT:    vmfeq.vv v8, v12, v12
+; CHECK-NEXT:    vmand.mm v0, v8, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -564,15 +564,15 @@ define <vscale x 8 x i1> @fcmp_ord_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v12, v12
+; CHECK-NEXT:    vmfeq.vv v10, v12, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfeq.vv v9, v12, v12
-; CHECK-NEXT:    vmand.mm v0, v8, v9
+; CHECK-NEXT:    vmfeq.vv v8, v12, v12
+; CHECK-NEXT:    vmand.mm v0, v10, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -600,9 +600,9 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmflt.vv v8, v12, v16
 ; CHECK-NEXT:    vmflt.vv v9, v16, v12
@@ -619,9 +619,9 @@ define <vscale x 8 x i1> @fcmp_ueq_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vmflt.vv v8, v16, v12
 ; CHECK-NEXT:    vmflt.vv v9, v12, v16
@@ -1082,15 +1082,15 @@ define <vscale x 8 x i1> @fcmp_uno_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v8, v12, v12
+; CHECK-NEXT:    vmfne.vv v10, v12, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v9, v12, v12
-; CHECK-NEXT:    vmor.mm v0, v8, v9
+; CHECK-NEXT:    vmfne.vv v8, v12, v12
+; CHECK-NEXT:    vmor.mm v0, v10, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -1103,15 +1103,15 @@ define <vscale x 8 x i1> @fcmp_uno_fv_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v8, v12, v12
+; CHECK-NEXT:    vmfne.vv v10, v12, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v9, v12, v12
-; CHECK-NEXT:    vmor.mm v0, v9, v8
+; CHECK-NEXT:    vmfne.vv v8, v12, v12
+; CHECK-NEXT:    vmor.mm v0, v8, v10
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -1141,15 +1141,15 @@ define <vscale x 8 x i1> @fcmp_uno_vf_nxv8bf16_nonans(<vscale x 8 x bfloat> %va,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v8, v12, v12
+; CHECK-NEXT:    vmfne.vv v10, v12, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vmfne.vv v9, v12, v12
-; CHECK-NEXT:    vmor.mm v0, v8, v9
+; CHECK-NEXT:    vmfne.vv v8, v12, v12
+; CHECK-NEXT:    vmor.mm v0, v10, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -1728,9 +1728,9 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12
@@ -1755,9 +1755,9 @@ define <vscale x 8 x i1> @fcmp_one_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16
@@ -1850,15 +1850,15 @@ define <vscale x 8 x i1> @fcmp_ord_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v12, v12
-; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmand.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -1880,15 +1880,15 @@ define <vscale x 8 x i1> @fcmp_ord_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v12, v12
-; ZVFHMIN-NEXT:    vmand.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmand.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -1935,15 +1935,15 @@ define <vscale x 8 x i1> @fcmp_ord_vf_nxv8f16_nonans(<vscale x 8 x half> %va, ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmfeq.vv v10, v12, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v9, v12, v12
-; ZVFHMIN-NEXT:    vmand.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmand.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -1987,9 +1987,9 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v12, v16
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v16, v12
@@ -2014,9 +2014,9 @@ define <vscale x 8 x i1> @fcmp_ueq_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmflt.vv v8, v16, v12
 ; ZVFHMIN-NEXT:    vmflt.vv v9, v12, v16
@@ -2668,15 +2668,15 @@ define <vscale x 8 x i1> @fcmp_uno_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v12, v12
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmor.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2698,15 +2698,15 @@ define <vscale x 8 x i1> @fcmp_uno_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v12, v12
-; ZVFHMIN-NEXT:    vmor.mm v0, v9, v8
+; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmor.mm v0, v8, v10
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2753,15 +2753,15 @@ define <vscale x 8 x i1> @fcmp_uno_vf_nxv8f16_nonans(<vscale x 8 x half> %va, ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmfne.vv v10, v12, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vmfne.vv v9, v12, v12
-; ZVFHMIN-NEXT:    vmor.mm v0, v8, v9
+; ZVFHMIN-NEXT:    vmfne.vv v8, v12, v12
+; ZVFHMIN-NEXT:    vmor.mm v0, v10, v8
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -4443,10 +4443,10 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; RV32-LABEL: fcmp_oeq_vf_nx16f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    fcvt.d.w fa5, zero
-; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vmfeq.vf v24, v16, fa5
 ; RV32-NEXT:    vmfeq.vf v0, v8, fa5
-; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    srli a0, a0, 3
 ; RV32-NEXT:    add a1, a0, a0
 ; RV32-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -4456,10 +4456,10 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; RV64-LABEL: fcmp_oeq_vf_nx16f64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    fmv.d.x fa5, zero
-; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vmfeq.vf v24, v16, fa5
 ; RV64-NEXT:    vmfeq.vf v0, v8, fa5
-; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    srli a0, a0, 3
 ; RV64-NEXT:    add a1, a0, a0
 ; RV64-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -4469,10 +4469,10 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; ZVFHMIN32-LABEL: fcmp_oeq_vf_nx16f64:
 ; ZVFHMIN32:       # %bb.0:
 ; ZVFHMIN32-NEXT:    fcvt.d.w fa5, zero
-; ZVFHMIN32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; ZVFHMIN32-NEXT:    vmfeq.vf v24, v16, fa5
 ; ZVFHMIN32-NEXT:    vmfeq.vf v0, v8, fa5
-; ZVFHMIN32-NEXT:    csrr a0, vlenb
 ; ZVFHMIN32-NEXT:    srli a0, a0, 3
 ; ZVFHMIN32-NEXT:    add a1, a0, a0
 ; ZVFHMIN32-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
@@ -4482,10 +4482,10 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; ZVFHMIN64-LABEL: fcmp_oeq_vf_nx16f64:
 ; ZVFHMIN64:       # %bb.0:
 ; ZVFHMIN64-NEXT:    fmv.d.x fa5, zero
-; ZVFHMIN64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; ZVFHMIN64-NEXT:    vmfeq.vf v24, v16, fa5
 ; ZVFHMIN64-NEXT:    vmfeq.vf v0, v8, fa5
-; ZVFHMIN64-NEXT:    csrr a0, vlenb
 ; ZVFHMIN64-NEXT:    srli a0, a0, 3
 ; ZVFHMIN64-NEXT:    add a1, a0, a0
 ; ZVFHMIN64-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index 8039aa368b7ccb..e8099c2d08a5f8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -558,8 +558,8 @@ define <vscale x 8 x i1> @icmp_eq_vx_nxv8i7(<vscale x 8 x i7> %va, i7 %b, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmseq.vv v0, v8, v9, v0.t
@@ -575,8 +575,8 @@ define <vscale x 8 x i1> @icmp_eq_vx_swap_nxv8i7(<vscale x 8 x i7> %va, i7 %b, <
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vmseq.vv v0, v9, v8, v0.t
@@ -1099,18 +1099,18 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a4, a0, a1
-; CHECK-NEXT:    vl8r.v v8, (a4)
 ; CHECK-NEXT:    vsetvli a4, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a2)
-; CHECK-NEXT:    sub a2, a3, a1
-; CHECK-NEXT:    sltu a4, a3, a2
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    sub a4, a3, a1
+; CHECK-NEXT:    vl8r.v v8, (a2)
+; CHECK-NEXT:    sltu a2, a3, a4
 ; CHECK-NEXT:    vl8r.v v24, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a2, a4, a2
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a4
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v6, v16, v8, v0.t
 ; CHECK-NEXT:    bltu a3, a1, .LBB96_2
@@ -2253,18 +2253,18 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    srli a1, a3, 2
 ; CHECK-NEXT:    slli a4, a3, 3
+; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    sub a5, a2, a3
 ; CHECK-NEXT:    vl8re32.v v8, (a4)
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    sub a4, a2, a3
-; CHECK-NEXT:    sltu a5, a2, a4
-; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    sltu a4, a2, a5
+; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    vl8re32.v v0, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a1
-; CHECK-NEXT:    and a4, a5, a4
+; CHECK-NEXT:    and a4, a4, a5
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vv v7, v16, v8, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB189_2
@@ -2302,9 +2302,9 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    srli a2, a3, 2
+; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    sub a4, a1, a3
 ; CHECK-NEXT:    sltu a5, a1, a4
 ; CHECK-NEXT:    addi a5, a5, -1
@@ -2335,9 +2335,9 @@ define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    srli a2, a3, 2
+; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    sub a4, a1, a3
 ; CHECK-NEXT:    sltu a5, a1, a4
 ; CHECK-NEXT:    addi a5, a5, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
index 1910953307e5df..bd3c29b0c6efca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
@@ -2998,10 +2998,10 @@ define <vscale x 16 x i1> @icmp_eq_vi_nx16i64(<vscale x 16 x i64> %va) {
 ; CHECK-LABEL: icmp_eq_vi_nx16i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vmseq.vi v24, v16, 0
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmseq.vi v24, v16, 0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v24, a0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
index abf89361cdea53..c91b02e8f15e47 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -244,34 +244,34 @@ define void @sink_splat_mul_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_mul_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB7_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB7_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB7_5
 ; CHECK-NEXT:  .LBB7_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB7_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vmul.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB7_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB7_7
 ; CHECK-NEXT:  .LBB7_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB7_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -335,34 +335,34 @@ define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_add_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB8_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB8_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB8_5
 ; CHECK-NEXT:  .LBB8_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB8_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vadd.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB8_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB8_7
 ; CHECK-NEXT:  .LBB8_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB8_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -426,34 +426,34 @@ define void @sink_splat_sub_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_sub_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB9_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB9_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB9_5
 ; CHECK-NEXT:  .LBB9_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB9_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vsub.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB9_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB9_7
 ; CHECK-NEXT:  .LBB9_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB9_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -517,34 +517,34 @@ define void @sink_splat_rsub_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_rsub_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB10_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB10_5
 ; CHECK-NEXT:  .LBB10_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB10_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB10_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB10_7
 ; CHECK-NEXT:  .LBB10_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB10_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -608,34 +608,34 @@ define void @sink_splat_and_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_and_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB11_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB11_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB11_5
 ; CHECK-NEXT:  .LBB11_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB11_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vand.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB11_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB11_7
 ; CHECK-NEXT:  .LBB11_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB11_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -699,34 +699,34 @@ define void @sink_splat_or_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_or_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB12_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB12_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB12_5
 ; CHECK-NEXT:  .LBB12_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB12_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vor.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB12_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB12_7
 ; CHECK-NEXT:  .LBB12_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB12_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -790,34 +790,34 @@ define void @sink_splat_xor_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_xor_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB13_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB13_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB13_5
 ; CHECK-NEXT:  .LBB13_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB13_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vxor.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB13_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB13_7
 ; CHECK-NEXT:  .LBB13_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB13_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -983,34 +983,34 @@ define void @sink_splat_shl_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_shl_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB17_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB17_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB17_5
 ; CHECK-NEXT:  .LBB17_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB17_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vsll.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB17_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB17_7
 ; CHECK-NEXT:  .LBB17_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB17_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1074,34 +1074,34 @@ define void @sink_splat_lshr_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_lshr_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB18_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB18_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB18_5
 ; CHECK-NEXT:  .LBB18_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB18_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vsrl.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB18_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB18_7
 ; CHECK-NEXT:  .LBB18_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB18_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1182,17 +1182,17 @@ define void @sink_splat_ashr_scalable(ptr nocapture %a) {
 ; CHECK-NEXT:  .LBB19_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a5)
+; CHECK-NEXT:    sub a6, a6, a2
 ; CHECK-NEXT:    vsra.vi v8, v8, 2
 ; CHECK-NEXT:    vs2r.v v8, (a5)
-; CHECK-NEXT:    sub a6, a6, a2
 ; CHECK-NEXT:    add a5, a5, a4
 ; CHECK-NEXT:    bnez a6, .LBB19_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a3, .LBB19_7
 ; CHECK-NEXT:  .LBB19_5: # %for.body.preheader
 ; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:  .LBB19_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1458,33 +1458,33 @@ define void @sink_splat_fmul_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_fmul_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB26_2
+; CHECK-NEXT:    srli a3, a1, 2
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB26_5
 ; CHECK-NEXT:  .LBB26_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a3
+; CHECK-NEXT:    mv a6, a2
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB26_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a5)
+; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfmul.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    sub a6, a6, a2
 ; CHECK-NEXT:    add a5, a5, a1
 ; CHECK-NEXT:    bnez a6, .LBB26_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB26_7
 ; CHECK-NEXT:  .LBB26_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a3, 2
-; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    slli a1, a2, 2
 ; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:  .LBB26_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1548,33 +1548,33 @@ define void @sink_splat_fdiv_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_fdiv_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB27_2
+; CHECK-NEXT:    srli a3, a1, 2
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB27_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB27_5
 ; CHECK-NEXT:  .LBB27_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a3
+; CHECK-NEXT:    mv a6, a2
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB27_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a5)
+; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfdiv.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    sub a6, a6, a2
 ; CHECK-NEXT:    add a5, a5, a1
 ; CHECK-NEXT:    bnez a6, .LBB27_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB27_7
 ; CHECK-NEXT:  .LBB27_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a3, 2
-; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    slli a1, a2, 2
 ; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:  .LBB27_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1638,33 +1638,33 @@ define void @sink_splat_frdiv_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_frdiv_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB28_2
+; CHECK-NEXT:    srli a3, a1, 2
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB28_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB28_5
 ; CHECK-NEXT:  .LBB28_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a3
+; CHECK-NEXT:    mv a6, a2
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB28_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a5)
+; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfrdiv.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    sub a6, a6, a2
 ; CHECK-NEXT:    add a5, a5, a1
 ; CHECK-NEXT:    bnez a6, .LBB28_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB28_7
 ; CHECK-NEXT:  .LBB28_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a3, 2
-; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    slli a1, a2, 2
 ; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:  .LBB28_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1728,33 +1728,33 @@ define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_fadd_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB29_2
+; CHECK-NEXT:    srli a3, a1, 2
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB29_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB29_5
 ; CHECK-NEXT:  .LBB29_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a3
+; CHECK-NEXT:    mv a6, a2
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB29_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a5)
+; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfadd.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    sub a6, a6, a2
 ; CHECK-NEXT:    add a5, a5, a1
 ; CHECK-NEXT:    bnez a6, .LBB29_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB29_7
 ; CHECK-NEXT:  .LBB29_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a3, 2
-; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    slli a1, a2, 2
 ; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:  .LBB29_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1818,33 +1818,33 @@ define void @sink_splat_fsub_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_fsub_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB30_2
+; CHECK-NEXT:    srli a3, a1, 2
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB30_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB30_5
 ; CHECK-NEXT:  .LBB30_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a3
+; CHECK-NEXT:    mv a6, a2
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB30_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a5)
+; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfsub.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    sub a6, a6, a2
 ; CHECK-NEXT:    add a5, a5, a1
 ; CHECK-NEXT:    bnez a6, .LBB30_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB30_7
 ; CHECK-NEXT:  .LBB30_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a3, 2
-; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    slli a1, a2, 2
 ; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:  .LBB30_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -1908,33 +1908,33 @@ define void @sink_splat_frsub_scalable(ptr nocapture %a, float %x) {
 ; CHECK-LABEL: sink_splat_frsub_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB31_2
+; CHECK-NEXT:    srli a3, a1, 2
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB31_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB31_5
 ; CHECK-NEXT:  .LBB31_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    mv a5, a0
-; CHECK-NEXT:    mv a6, a3
+; CHECK-NEXT:    mv a6, a2
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB31_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a5)
+; CHECK-NEXT:    sub a6, a6, a3
 ; CHECK-NEXT:    vfrsub.vf v8, v8, fa0
 ; CHECK-NEXT:    vs1r.v v8, (a5)
-; CHECK-NEXT:    sub a6, a6, a2
 ; CHECK-NEXT:    add a5, a5, a1
 ; CHECK-NEXT:    bnez a6, .LBB31_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB31_7
 ; CHECK-NEXT:  .LBB31_5: # %for.body.preheader
-; CHECK-NEXT:    slli a1, a3, 2
-; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    slli a1, a2, 2
 ; CHECK-NEXT:    lui a2, 1
+; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:  .LBB31_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -2004,9 +2004,9 @@ define void @sink_splat_fma(ptr noalias nocapture %a, ptr nocapture readonly %b,
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    vle32.v v9, (a1)
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vfmacc.vf v9, fa0, v8
 ; CHECK-NEXT:    vse32.v v9, (a0)
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    bne a1, a2, .LBB32_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -2042,9 +2042,9 @@ define void @sink_splat_fma_commute(ptr noalias nocapture %a, ptr nocapture read
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    vle32.v v9, (a1)
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vfmacc.vf v9, fa0, v8
 ; CHECK-NEXT:    vse32.v v9, (a0)
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    bne a1, a2, .LBB33_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -2074,45 +2074,45 @@ define void @sink_splat_fma_scalable(ptr noalias nocapture %a, ptr noalias nocap
 ; CHECK-LABEL: sink_splat_fma_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    srli a3, a2, 2
-; CHECK-NEXT:    li a4, 1024
-; CHECK-NEXT:    bgeu a4, a3, .LBB34_2
+; CHECK-NEXT:    srli a4, a2, 2
+; CHECK-NEXT:    li a3, 1024
+; CHECK-NEXT:    bgeu a3, a4, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a4, 0
+; CHECK-NEXT:    li a3, 0
 ; CHECK-NEXT:    j .LBB34_5
 ; CHECK-NEXT:  .LBB34_2: # %vector.ph
-; CHECK-NEXT:    addi a4, a3, -1
-; CHECK-NEXT:    andi a5, a4, 1024
-; CHECK-NEXT:    xori a4, a5, 1024
+; CHECK-NEXT:    addi a3, a4, -1
+; CHECK-NEXT:    andi a5, a3, 1024
+; CHECK-NEXT:    xori a3, a5, 1024
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:    mv a7, a1
-; CHECK-NEXT:    mv t0, a4
+; CHECK-NEXT:    mv t0, a3
 ; CHECK-NEXT:    vsetvli t1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB34_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a6)
 ; CHECK-NEXT:    vl1re32.v v9, (a7)
+; CHECK-NEXT:    sub t0, t0, a4
+; CHECK-NEXT:    add a7, a7, a2
 ; CHECK-NEXT:    vfmacc.vf v9, fa0, v8
 ; CHECK-NEXT:    vs1r.v v9, (a6)
-; CHECK-NEXT:    sub t0, t0, a3
-; CHECK-NEXT:    add a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a2
 ; CHECK-NEXT:    bnez t0, .LBB34_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a5, .LBB34_7
 ; CHECK-NEXT:  .LBB34_5: # %for.body.preheader
-; CHECK-NEXT:    slli a4, a4, 2
-; CHECK-NEXT:    add a2, a1, a4
-; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    slli a2, a3, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a2, a1, a2
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:  .LBB34_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    flw fa5, 0(a0)
 ; CHECK-NEXT:    flw fa4, 0(a2)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    fmadd.s fa5, fa5, fa0, fa4
 ; CHECK-NEXT:    fsw fa5, 0(a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, 4
 ; CHECK-NEXT:    bne a2, a1, .LBB34_6
 ; CHECK-NEXT:  .LBB34_7: # %for.cond.cleanup
@@ -2174,45 +2174,45 @@ define void @sink_splat_fma_commute_scalable(ptr noalias nocapture %a, ptr noali
 ; CHECK-LABEL: sink_splat_fma_commute_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    srli a3, a2, 2
-; CHECK-NEXT:    li a4, 1024
-; CHECK-NEXT:    bgeu a4, a3, .LBB35_2
+; CHECK-NEXT:    srli a4, a2, 2
+; CHECK-NEXT:    li a3, 1024
+; CHECK-NEXT:    bgeu a3, a4, .LBB35_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a4, 0
+; CHECK-NEXT:    li a3, 0
 ; CHECK-NEXT:    j .LBB35_5
 ; CHECK-NEXT:  .LBB35_2: # %vector.ph
-; CHECK-NEXT:    addi a4, a3, -1
-; CHECK-NEXT:    andi a5, a4, 1024
-; CHECK-NEXT:    xori a4, a5, 1024
+; CHECK-NEXT:    addi a3, a4, -1
+; CHECK-NEXT:    andi a5, a3, 1024
+; CHECK-NEXT:    xori a3, a5, 1024
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:    mv a7, a1
-; CHECK-NEXT:    mv t0, a4
+; CHECK-NEXT:    mv t0, a3
 ; CHECK-NEXT:    vsetvli t1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:  .LBB35_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1re32.v v8, (a6)
 ; CHECK-NEXT:    vl1re32.v v9, (a7)
+; CHECK-NEXT:    sub t0, t0, a4
+; CHECK-NEXT:    add a7, a7, a2
 ; CHECK-NEXT:    vfmacc.vf v9, fa0, v8
 ; CHECK-NEXT:    vs1r.v v9, (a6)
-; CHECK-NEXT:    sub t0, t0, a3
-; CHECK-NEXT:    add a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a2
 ; CHECK-NEXT:    bnez t0, .LBB35_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a5, .LBB35_7
 ; CHECK-NEXT:  .LBB35_5: # %for.body.preheader
-; CHECK-NEXT:    slli a4, a4, 2
-; CHECK-NEXT:    add a2, a1, a4
-; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    slli a2, a3, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a2, a1, a2
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:  .LBB35_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    flw fa5, 0(a0)
 ; CHECK-NEXT:    flw fa4, 0(a2)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    fmadd.s fa5, fa0, fa5, fa4
 ; CHECK-NEXT:    fsw fa5, 0(a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, 4
 ; CHECK-NEXT:    bne a2, a1, .LBB35_6
 ; CHECK-NEXT:  .LBB35_7: # %for.cond.cleanup
@@ -2487,34 +2487,34 @@ define void @sink_splat_udiv_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_udiv_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB42_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB42_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB42_5
 ; CHECK-NEXT:  .LBB42_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB42_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vdivu.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB42_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB42_7
 ; CHECK-NEXT:  .LBB42_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB42_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -2578,34 +2578,34 @@ define void @sink_splat_sdiv_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_sdiv_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB43_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB43_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB43_5
 ; CHECK-NEXT:  .LBB43_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB43_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vdiv.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB43_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB43_7
 ; CHECK-NEXT:  .LBB43_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB43_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -2669,34 +2669,34 @@ define void @sink_splat_urem_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_urem_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB44_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB44_5
 ; CHECK-NEXT:  .LBB44_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB44_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vremu.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB44_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB44_7
 ; CHECK-NEXT:  .LBB44_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB44_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -2760,34 +2760,34 @@ define void @sink_splat_srem_scalable(ptr nocapture %a, i32 signext %x) {
 ; CHECK-LABEL: sink_splat_srem_scalable:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    srli a2, a5, 1
-; CHECK-NEXT:    li a3, 1024
-; CHECK-NEXT:    bgeu a3, a2, .LBB45_2
+; CHECK-NEXT:    srli a3, a5, 1
+; CHECK-NEXT:    li a2, 1024
+; CHECK-NEXT:    bgeu a2, a3, .LBB45_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    li a2, 0
 ; CHECK-NEXT:    j .LBB45_5
 ; CHECK-NEXT:  .LBB45_2: # %vector.ph
-; CHECK-NEXT:    addi a3, a2, -1
-; CHECK-NEXT:    andi a4, a3, 1024
-; CHECK-NEXT:    xori a3, a4, 1024
+; CHECK-NEXT:    addi a2, a3, -1
+; CHECK-NEXT:    andi a4, a2, 1024
+; CHECK-NEXT:    xori a2, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    mv a6, a0
-; CHECK-NEXT:    mv a7, a3
+; CHECK-NEXT:    mv a7, a2
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:  .LBB45_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl2re32.v v8, (a6)
+; CHECK-NEXT:    sub a7, a7, a3
 ; CHECK-NEXT:    vrem.vx v8, v8, a1
 ; CHECK-NEXT:    vs2r.v v8, (a6)
-; CHECK-NEXT:    sub a7, a7, a2
 ; CHECK-NEXT:    add a6, a6, a5
 ; CHECK-NEXT:    bnez a7, .LBB45_3
 ; CHECK-NEXT:  # %bb.4: # %middle.block
 ; CHECK-NEXT:    beqz a4, .LBB45_7
 ; CHECK-NEXT:  .LBB45_5: # %for.body.preheader
-; CHECK-NEXT:    slli a2, a3, 2
-; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    lui a3, 1
+; CHECK-NEXT:    add a2, a0, a2
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:  .LBB45_6: # %for.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -2857,9 +2857,9 @@ define void @sink_splat_min(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB46_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vmin.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB46_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -2891,9 +2891,9 @@ define void @sink_splat_min_commute(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB47_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vmin.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB47_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -2927,9 +2927,9 @@ define void @sink_splat_max(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB48_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vmax.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB48_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -2961,9 +2961,9 @@ define void @sink_splat_max_commute(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB49_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vmax.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB49_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -2997,9 +2997,9 @@ define void @sink_splat_umin(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB50_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB50_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -3031,9 +3031,9 @@ define void @sink_splat_umin_commute(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB51_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vminu.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB51_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -3067,9 +3067,9 @@ define void @sink_splat_umax(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB52_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB52_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -3101,9 +3101,9 @@ define void @sink_splat_umax_commute(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB53_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vmaxu.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB53_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -3207,9 +3207,9 @@ define void @sink_splat_ssub_sat(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB56_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vssub.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB56_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -3313,9 +3313,9 @@ define void @sink_splat_usub_sat(ptr nocapture %a, i32 signext %x) {
 ; CHECK-NEXT:  .LBB59_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    vssubu.vx v8, v8, a1
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a2, a2, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a2, .LBB59_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -4068,11 +4068,11 @@ define void @sink_splat_vp_fma(ptr noalias nocapture %a, ptr nocapture readonly
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    vle32.v v9, (a1)
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    bne a1, a3, .LBB79_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -4108,11 +4108,11 @@ define void @sink_splat_vp_fma_commute(ptr noalias nocapture %a, ptr nocapture r
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    vle32.v v9, (a1)
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmadd.vf v8, fa0, v9, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    bne a1, a3, .LBB80_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -5284,11 +5284,11 @@ define void @sink_splat_vp_ssub_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %
 ; CHECK-NEXT:  .LBB113_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a3, a3, 4
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
 ; CHECK-NEXT:    vssub.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a3, a3, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a3, .LBB113_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
@@ -5396,11 +5396,11 @@ define void @sink_splat_vp_usub_sat(ptr nocapture %a, i32 signext %x, <4 x i1> %
 ; CHECK-NEXT:  .LBB116_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    addi a3, a3, 4
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
 ; CHECK-NEXT:    vssubu.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    addi a3, a3, 4
 ; CHECK-NEXT:    addi a0, a0, -16
 ; CHECK-NEXT:    bnez a3, .LBB116_1
 ; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll
index b526822975c87f..26325328e5671c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/splat-vector-split-i64-vl-sdnode.ll
@@ -12,68 +12,68 @@ define i32 @splat_vector_split_i64() {
 ; CHECK-NEXT:    vmv.v.i v10, 3
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    lui a1, 1044480
+; CHECK-NEXT:    li a2, 56
+; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    lui a4, 16
+; CHECK-NEXT:    lui a0, 4080
+; CHECK-NEXT:    addi a5, sp, 8
+; CHECK-NEXT:    sw a1, 8(sp)
+; CHECK-NEXT:    sw zero, 12(sp)
+; CHECK-NEXT:    addi a1, a4, -256
 ; CHECK-NEXT:    vsetivli zero, 4, e64, m2, tu, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 3
-; CHECK-NEXT:    lui a0, 1044480
-; CHECK-NEXT:    sw a0, 8(sp)
-; CHECK-NEXT:    sw zero, 12(sp)
-; CHECK-NEXT:    li a0, 56
+; CHECK-NEXT:    vsetvli a4, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vlse64.v v10, (a5), zero
+; CHECK-NEXT:    vsrl.vx v12, v8, a2
+; CHECK-NEXT:    vsrl.vx v14, v8, a3
+; CHECK-NEXT:    vsrl.vi v16, v8, 24
+; CHECK-NEXT:    vsll.vx v18, v8, a2
+; CHECK-NEXT:    vand.vx v14, v14, a1
+; CHECK-NEXT:    vor.vv v14, v14, v12
+; CHECK-NEXT:    vand.vx v12, v8, a1
+; CHECK-NEXT:    vsll.vx v12, v12, a3
+; CHECK-NEXT:    vor.vv v12, v18, v12
+; CHECK-NEXT:    vsrl.vi v18, v8, 8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vand.vv v18, v18, v10
+; CHECK-NEXT:    vor.vv v16, v18, v16
+; CHECK-NEXT:    lui a1, 61681
+; CHECK-NEXT:    lui a2, 209715
+; CHECK-NEXT:    lui a3, 349525
+; CHECK-NEXT:    addi a1, a1, -241
+; CHECK-NEXT:    addi a2, a2, 819
+; CHECK-NEXT:    addi a3, a3, 1365
+; CHECK-NEXT:    vor.vv v14, v16, v14
+; CHECK-NEXT:    vsetvli a4, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vsrl.vx v10, v8, a0
-; CHECK-NEXT:    li a1, 40
-; CHECK-NEXT:    vsrl.vx v12, v8, a1
-; CHECK-NEXT:    lui a2, 16
-; CHECK-NEXT:    addi a2, a2, -256
-; CHECK-NEXT:    vand.vx v12, v12, a2
-; CHECK-NEXT:    vor.vv v10, v12, v10
-; CHECK-NEXT:    vsrl.vi v12, v8, 24
-; CHECK-NEXT:    addi a3, sp, 8
-; CHECK-NEXT:    vlse64.v v14, (a3), zero
-; CHECK-NEXT:    lui a3, 4080
-; CHECK-NEXT:    vand.vx v12, v12, a3
-; CHECK-NEXT:    vsrl.vi v16, v8, 8
-; CHECK-NEXT:    vand.vv v16, v16, v14
-; CHECK-NEXT:    vor.vv v12, v16, v12
-; CHECK-NEXT:    vor.vv v10, v12, v10
-; CHECK-NEXT:    vand.vv v12, v8, v14
-; CHECK-NEXT:    vsll.vi v12, v12, 8
-; CHECK-NEXT:    vand.vx v14, v8, a3
-; CHECK-NEXT:    vsll.vi v14, v14, 24
-; CHECK-NEXT:    vor.vv v12, v14, v12
-; CHECK-NEXT:    vsll.vx v14, v8, a0
-; CHECK-NEXT:    vand.vx v8, v8, a2
-; CHECK-NEXT:    vsll.vx v8, v8, a1
-; CHECK-NEXT:    vor.vv v8, v14, v8
-; CHECK-NEXT:    vor.vv v8, v8, v12
+; CHECK-NEXT:    vand.vv v10, v8, v10
+; CHECK-NEXT:    vand.vx v8, v8, a0
+; CHECK-NEXT:    vsll.vi v8, v8, 24
+; CHECK-NEXT:    vsll.vi v10, v10, 8
 ; CHECK-NEXT:    vor.vv v8, v8, v10
-; CHECK-NEXT:    vsrl.vi v10, v8, 4
-; CHECK-NEXT:    lui a0, 61681
-; CHECK-NEXT:    addi a0, a0, -241
-; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a2
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vv v10, v10, v12
-; CHECK-NEXT:    vand.vv v8, v8, v12
-; CHECK-NEXT:    vsll.vi v8, v8, 4
-; CHECK-NEXT:    vor.vv v8, v10, v8
-; CHECK-NEXT:    vsrl.vi v10, v8, 2
-; CHECK-NEXT:    lui a0, 209715
-; CHECK-NEXT:    addi a0, a0, 819
-; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v12, a3
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vv v10, v10, v12
-; CHECK-NEXT:    vand.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v8, v14
+; CHECK-NEXT:    vsrl.vi v14, v8, 4
+; CHECK-NEXT:    vand.vv v8, v8, v16
+; CHECK-NEXT:    vand.vv v14, v14, v16
+; CHECK-NEXT:    vsll.vi v8, v8, 4
+; CHECK-NEXT:    vor.vv v8, v14, v8
+; CHECK-NEXT:    vsrl.vi v14, v8, 2
+; CHECK-NEXT:    vand.vv v8, v8, v10
+; CHECK-NEXT:    vand.vv v10, v14, v10
 ; CHECK-NEXT:    vsll.vi v8, v8, 2
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vsrl.vi v10, v8, 1
-; CHECK-NEXT:    lui a0, 349525
-; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
-; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vv v10, v10, v12
 ; CHECK-NEXT:    vand.vv v8, v8, v12
+; CHECK-NEXT:    vand.vv v10, v10, v12
 ; CHECK-NEXT:    vadd.vv v8, v8, v8
 ; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    vslidedown.vi v8, v8, 3

diff  --git a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
index 9d0234d2ec2fbc..371055704c0908 100644
--- a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll
@@ -12,8 +12,8 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    vsra.vv v9, v10, v9
 ; CHECK-NEXT:    vmsne.vv v8, v8, v9
 ; CHECK-NEXT:    vmv.v.x v9, a1
@@ -32,14 +32,14 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vsll.vv v10, v8, v9
 ; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vsll.vv v11, v8, v9
-; CHECK-NEXT:    vsra.vv v9, v11, v9
+; CHECK-NEXT:    vsra.vv v9, v10, v9
 ; CHECK-NEXT:    vmsne.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vxm v9, v10, a0, v0
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
 ; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v11, v9, v0
+; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp = call <4 x i32> @llvm.sshl.sat.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %tmp
@@ -51,14 +51,14 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    vsll.vv v10, v8, v9
 ; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vsll.vv v11, v8, v9
-; CHECK-NEXT:    vsra.vv v9, v11, v9
+; CHECK-NEXT:    vsra.vv v9, v10, v9
 ; CHECK-NEXT:    vmsne.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vxm v9, v10, a0, v0
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
 ; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v11, v9, v0
+; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp = call <8 x i16> @llvm.sshl.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %tmp
@@ -70,14 +70,14 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vsll.vv v11, v8, v9
-; CHECK-NEXT:    vsra.vv v9, v11, v9
+; CHECK-NEXT:    vsll.vv v10, v8, v9
+; CHECK-NEXT:    vsra.vv v9, v10, v9
 ; CHECK-NEXT:    vmsne.vv v8, v8, v9
-; CHECK-NEXT:    vmerge.vxm v9, v10, a0, v0
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    li a0, 128
+; CHECK-NEXT:    vmerge.vxm v9, v9, a0, v0
 ; CHECK-NEXT:    vmv.v.v v0, v8
-; CHECK-NEXT:    vmerge.vvm v8, v11, v9, v0
+; CHECK-NEXT:    vmerge.vvm v8, v10, v9, v0
 ; CHECK-NEXT:    ret
   %tmp = call <16 x i8> @llvm.sshl.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %tmp
@@ -94,8 +94,8 @@ define <vscale x 2 x i64> @vec_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64>
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    srli a1, a0, 1
 ; CHECK-NEXT:    vsra.vv v14, v12, v10
 ; CHECK-NEXT:    vmsne.vv v10, v8, v14
 ; CHECK-NEXT:    vmv.v.x v8, a1
@@ -114,14 +114,14 @@ define <vscale x 4 x i32> @vec_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32>
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vsll.vv v12, v8, v10
 ; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vmv.v.x v12, a1
-; CHECK-NEXT:    vsll.vv v14, v8, v10
-; CHECK-NEXT:    vsra.vv v16, v14, v10
-; CHECK-NEXT:    vmsne.vv v10, v8, v16
-; CHECK-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-NEXT:    vsra.vv v14, v12, v10
+; CHECK-NEXT:    vmsne.vv v10, v8, v14
+; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vvm v8, v14, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    ret
   %tmp = call <vscale x 4 x i32> @llvm.sshl.sat.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
   ret <vscale x 4 x i32> %tmp
@@ -133,14 +133,14 @@ define <vscale x 8 x i16> @vec_nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16>
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    vsll.vv v12, v8, v10
 ; CHECK-NEXT:    addi a1, a0, -1
-; CHECK-NEXT:    vmv.v.x v12, a1
-; CHECK-NEXT:    vsll.vv v14, v8, v10
-; CHECK-NEXT:    vsra.vv v16, v14, v10
-; CHECK-NEXT:    vmsne.vv v10, v8, v16
-; CHECK-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-NEXT:    vsra.vv v14, v12, v10
+; CHECK-NEXT:    vmsne.vv v10, v8, v14
+; CHECK-NEXT:    vmv.v.x v8, a1
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vvm v8, v14, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    ret
   %tmp = call <vscale x 8 x i16> @llvm.sshl.sat.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y)
   ret <vscale x 8 x i16> %tmp
@@ -152,14 +152,14 @@ define <vscale x 16 x i8> @vec_nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8>
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmsle.vi v0, v8, -1
 ; CHECK-NEXT:    li a0, 127
-; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vsll.vv v12, v8, v10
+; CHECK-NEXT:    vsra.vv v14, v12, v10
+; CHECK-NEXT:    vmsne.vv v10, v8, v14
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    li a0, 128
-; CHECK-NEXT:    vsll.vv v14, v8, v10
-; CHECK-NEXT:    vsra.vv v16, v14, v10
-; CHECK-NEXT:    vmsne.vv v10, v8, v16
-; CHECK-NEXT:    vmerge.vxm v8, v12, a0, v0
+; CHECK-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vvm v8, v14, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v12, v8, v0
 ; CHECK-NEXT:    ret
   %tmp = call <vscale x 16 x i8> @llvm.sshl.sat.nxv16i8(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y)
   ret <vscale x 16 x i8> %tmp

diff  --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
index 8515b2566cd0ac..62339130678d04 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
@@ -577,9 +577,9 @@ define <vscale x 16 x i64> @add_stepvector_nxv16i64() {
 ; RV64-LABEL: add_stepvector_nxv16i64:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 1
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vid.v v8
+; RV64-NEXT:    slli a0, a0, 1
 ; RV64-NEXT:    vadd.vv v8, v8, v8
 ; RV64-NEXT:    vadd.vx v16, v8, a0
 ; RV64-NEXT:    ret
@@ -616,10 +616,10 @@ define <vscale x 16 x i64> @mul_stepvector_nxv16i64() {
 ; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vid.v v8
 ; RV64-NEXT:    li a0, 3
+; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    vmul.vx v8, v8, a0
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 1
-; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    slli a0, a1, 1
+; RV64-NEXT:    add a0, a0, a1
 ; RV64-NEXT:    vadd.vx v16, v8, a0
 ; RV64-NEXT:    ret
 entry:
@@ -635,26 +635,26 @@ define <vscale x 16 x i64> @mul_bigimm_stepvector_nxv16i64() {
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    li a0, 7
 ; RV32-NEXT:    lui a1, 797989
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    lui a3, 11557
+; RV32-NEXT:    lui a4, 92455
 ; RV32-NEXT:    addi a1, a1, -683
+; RV32-NEXT:    addi a3, a3, -683
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    lui a1, 11557
-; RV32-NEXT:    addi a1, a1, -683
-; RV32-NEXT:    mul a1, a0, a1
-; RV32-NEXT:    srli a0, a0, 3
-; RV32-NEXT:    lui a2, 92455
-; RV32-NEXT:    addi a2, a2, -1368
-; RV32-NEXT:    mulhu a2, a0, a2
+; RV32-NEXT:    srli a0, a2, 3
+; RV32-NEXT:    addi a1, a4, -1368
+; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    mulhu a1, a0, a1
 ; RV32-NEXT:    slli a3, a0, 1
 ; RV32-NEXT:    slli a0, a0, 6
 ; RV32-NEXT:    sub a0, a0, a3
-; RV32-NEXT:    add a0, a2, a0
-; RV32-NEXT:    sw a1, 0(sp)
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    addi a1, sp, 8
+; RV32-NEXT:    sw a2, 0(sp)
 ; RV32-NEXT:    sw a0, 4(sp)
-; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a0), zero
+; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v8, (a1), zero
 ; RV32-NEXT:    mv a0, sp
 ; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vid.v v24
@@ -668,12 +668,12 @@ define <vscale x 16 x i64> @mul_bigimm_stepvector_nxv16i64() {
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    lui a1, 1987
+; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; RV64-NEXT:    vid.v v8
 ; RV64-NEXT:    addiw a1, a1, -731
 ; RV64-NEXT:    slli a1, a1, 12
 ; RV64-NEXT:    addi a1, a1, -683
 ; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
-; RV64-NEXT:    vid.v v8
 ; RV64-NEXT:    vmul.vx v8, v8, a1
 ; RV64-NEXT:    vadd.vx v16, v8, a0
 ; RV64-NEXT:    ret
@@ -705,9 +705,9 @@ define <vscale x 16 x i64> @shl_stepvector_nxv16i64() {
 ; RV64-LABEL: shl_stepvector_nxv16i64:
 ; RV64:       # %bb.0: # %entry
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 2
 ; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vid.v v8
+; RV64-NEXT:    slli a0, a0, 2
 ; RV64-NEXT:    vsll.vi v8, v8, 2
 ; RV64-NEXT:    vadd.vx v16, v8, a0
 ; RV64-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 0e76518f67e16a..f8315de324e42b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -765,9 +765,9 @@ declare <vscale x 16 x double> @llvm.experimental.vp.strided.load.nxv16f64.p0.i6
 define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vscale x 17 x i1> %mask, i32 zeroext %evl, ptr %hi_ptr) {
 ; CHECK-RV32-LABEL: strided_load_nxv17f64:
 ; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vmv1r.v v8, v0
 ; CHECK-RV32-NEXT:    csrr a2, vlenb
 ; CHECK-RV32-NEXT:    slli a7, a2, 1
-; CHECK-RV32-NEXT:    vmv1r.v v8, v0
 ; CHECK-RV32-NEXT:    mv a6, a3
 ; CHECK-RV32-NEXT:    bltu a3, a7, .LBB57_2
 ; CHECK-RV32-NEXT:  # %bb.1:
@@ -784,12 +784,12 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV32-NEXT:  .LBB57_4:
 ; CHECK-RV32-NEXT:    mul t1, a5, a1
 ; CHECK-RV32-NEXT:    srli t2, a2, 3
+; CHECK-RV32-NEXT:    sub a7, a3, a7
 ; CHECK-RV32-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
 ; CHECK-RV32-NEXT:    vslidedown.vx v0, v8, t2
 ; CHECK-RV32-NEXT:    add t1, a0, t1
 ; CHECK-RV32-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV32-NEXT:    vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV32-NEXT:    sub a7, a3, a7
 ; CHECK-RV32-NEXT:    sltu a3, a3, a7
 ; CHECK-RV32-NEXT:    addi a3, a3, -1
 ; CHECK-RV32-NEXT:    and a3, a3, a7
@@ -812,9 +812,9 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ;
 ; CHECK-RV64-LABEL: strided_load_nxv17f64:
 ; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vmv1r.v v8, v0
 ; CHECK-RV64-NEXT:    csrr a4, vlenb
 ; CHECK-RV64-NEXT:    slli a7, a4, 1
-; CHECK-RV64-NEXT:    vmv1r.v v8, v0
 ; CHECK-RV64-NEXT:    mv a6, a2
 ; CHECK-RV64-NEXT:    bltu a2, a7, .LBB57_2
 ; CHECK-RV64-NEXT:  # %bb.1:
@@ -831,12 +831,12 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 ; CHECK-RV64-NEXT:  .LBB57_4:
 ; CHECK-RV64-NEXT:    mul t1, a5, a1
 ; CHECK-RV64-NEXT:    srli t2, a4, 3
+; CHECK-RV64-NEXT:    sub a7, a2, a7
 ; CHECK-RV64-NEXT:    vsetvli t3, zero, e8, mf4, ta, ma
 ; CHECK-RV64-NEXT:    vslidedown.vx v0, v8, t2
 ; CHECK-RV64-NEXT:    add t1, a0, t1
 ; CHECK-RV64-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vlse64.v v16, (t1), a1, v0.t
-; CHECK-RV64-NEXT:    sub a7, a2, a7
 ; CHECK-RV64-NEXT:    sltu a2, a2, a7
 ; CHECK-RV64-NEXT:    addi a2, a2, -1
 ; CHECK-RV64-NEXT:    and a2, a2, a7

diff  --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
index 7f0ef443eeffc4..98ec99bcfea33e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -564,13 +564,13 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a0), a1, v0.t
 ; CHECK-NEXT:    sub a5, a2, a3
-; CHECK-NEXT:    sltu a2, a2, a5
-; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a5
 ; CHECK-NEXT:    mul a4, a4, a1
 ; CHECK-NEXT:    srli a3, a3, 3
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sltu a2, a2, a5
+; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a5
 ; CHECK-NEXT:    add a0, a0, a4
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a0), a1, v0.t
@@ -582,19 +582,19 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
 define void @strided_store_nxv16f64_allones_mask(<vscale x 16 x double> %v, ptr %ptr, i32 signext %stride, i32 zeroext %evl) {
 ; CHECK-LABEL: strided_store_nxv16f64_allones_mask:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    bltu a2, a3, .LBB47_2
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    bltu a2, a4, .LBB47_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a4, a3
+; CHECK-NEXT:    mv a3, a4
 ; CHECK-NEXT:  .LBB47_2:
-; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a0), a1
-; CHECK-NEXT:    sub a3, a2, a3
-; CHECK-NEXT:    sltu a2, a2, a3
+; CHECK-NEXT:    sub a4, a2, a4
+; CHECK-NEXT:    mul a3, a3, a1
+; CHECK-NEXT:    sltu a2, a2, a4
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    mul a3, a4, a1
+; CHECK-NEXT:    and a2, a2, a4
 ; CHECK-NEXT:    add a0, a0, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v16, (a0), a1
@@ -609,9 +609,17 @@ declare void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32(<vscale x 16 x
 define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 signext %stride, <vscale x 17 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: strided_store_nxv17f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a6, a4, 1
+; CHECK-NEXT:    slli a4, a4, 3
+; CHECK-NEXT:    sub sp, sp, a4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    addi a4, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    slli a6, a4, 1
 ; CHECK-NEXT:    mv a5, a3
 ; CHECK-NEXT:    bltu a3, a6, .LBB48_2
 ; CHECK-NEXT:  # %bb.1:
@@ -622,33 +630,27 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a7, a4
 ; CHECK-NEXT:  .LBB48_4:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 3
-; CHECK-NEXT:    sub sp, sp, t0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a1), a2, v0.t
 ; CHECK-NEXT:    sub a0, a5, a4
+; CHECK-NEXT:    mul a7, a7, a2
+; CHECK-NEXT:    srli t0, a4, 3
+; CHECK-NEXT:    sub a6, a3, a6
+; CHECK-NEXT:    vsetvli t1, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v24, t0
 ; CHECK-NEXT:    sltu t0, a5, a0
+; CHECK-NEXT:    add a7, a1, a7
+; CHECK-NEXT:    sltu a3, a3, a6
 ; CHECK-NEXT:    addi t0, t0, -1
-; CHECK-NEXT:    and t0, t0, a0
-; CHECK-NEXT:    mul a0, a7, a2
-; CHECK-NEXT:    add a7, a1, a0
-; CHECK-NEXT:    srli a0, a4, 3
-; CHECK-NEXT:    vsetvli t1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a0
-; CHECK-NEXT:    sub a0, a3, a6
-; CHECK-NEXT:    sltu a3, a3, a0
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a0, a3, a0
+; CHECK-NEXT:    and t0, t0, a0
+; CHECK-NEXT:    and a0, a3, a6
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
-; CHECK-NEXT:    vsse64.v v16, (a7), a2, v0.t
+; CHECK-NEXT:    vsse64.v v8, (a7), a2, v0.t
 ; CHECK-NEXT:    bltu a0, a4, .LBB48_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a0, a4
@@ -658,10 +660,8 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a4
 ; CHECK-NEXT:    add a1, a1, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsse64.v v8, (a1), a2, v0.t
+; CHECK-NEXT:    vsse64.v v16, (a1), a2, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
index c81e678900ab04..ab13c78da05e87 100644
--- a/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/undef-earlyclobber-chain.ll
@@ -41,34 +41,30 @@ define internal void @SubRegLivenessUndefInPhi(i64 %cond) {
 ; CHECK-NEXT:  # %bb.1: # %Cond1
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vadd.vi v10, v8, 1
-; CHECK-NEXT:    vadd.vi v12, v8, 3
+; CHECK-NEXT:    vadd.vi v12, v8, 1
+; CHECK-NEXT:    vadd.vi v10, v8, 3
 ; CHECK-NEXT:    j .LBB2_3
 ; CHECK-NEXT:  .LBB2_2: # %Cond2
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vid.v v9
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vadd.vi v10, v9, 1
+; CHECK-NEXT:    vadd.vi v11, v9, 3
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a0
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v11, v9, 1
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v12, v10, a0
 ; CHECK-NEXT:    vslideup.vx v10, v11, a0
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v9, 3
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v12, v9, a0
 ; CHECK-NEXT:  .LBB2_3: # %UseSR
 ; CHECK-NEXT:    vl1r.v v14, (zero)
 ; CHECK-NEXT:    vsetivli zero, 4, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v13, v14, v8
-; CHECK-NEXT:    vrgatherei16.vv v8, v14, v10
+; CHECK-NEXT:    vrgatherei16.vv v15, v14, v8
+; CHECK-NEXT:    vrgatherei16.vv v8, v14, v12
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vv v8, v13, v8
+; CHECK-NEXT:    vand.vv v8, v15, v8
 ; CHECK-NEXT:    vsetivli zero, 4, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v9, v14, v12
+; CHECK-NEXT:    vrgatherei16.vv v9, v14, v10
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vv v8, v8, v9
 ; CHECK-NEXT:    vs1r.v v8, (zero)
@@ -120,10 +116,10 @@ define internal void @SubRegLivenessUndef() {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vl1r.v v14, (zero)
 ; CHECK-NEXT:    vsetivli zero, 4, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v13, v14, v8
+; CHECK-NEXT:    vrgatherei16.vv v15, v14, v8
 ; CHECK-NEXT:    vrgatherei16.vv v9, v14, v10
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vv v9, v13, v9
+; CHECK-NEXT:    vand.vv v9, v15, v9
 ; CHECK-NEXT:    vsetivli zero, 4, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v11, v14, v12
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll
index 52c2cace185f78..0bd82e654e021c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/urem-seteq-vec.ll
@@ -9,11 +9,11 @@ define <vscale x 1 x i16> @test_urem_vec_even_divisor_eq0(<vscale x 1 x i16> %x)
 ; RV32-NEXT:    addi a0, a0, -1365
 ; RV32-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV32-NEXT:    vmul.vx v8, v8, a0
+; RV32-NEXT:    lui a0, 3
+; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vsll.vi v9, v8, 15
 ; RV32-NEXT:    vsrl.vi v8, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    lui a0, 3
-; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vmsgtu.vx v0, v8, a0
 ; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
@@ -25,11 +25,11 @@ define <vscale x 1 x i16> @test_urem_vec_even_divisor_eq0(<vscale x 1 x i16> %x)
 ; RV64-NEXT:    addi a0, a0, -1365
 ; RV64-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    lui a0, 3
+; RV64-NEXT:    addi a0, a0, -1366
 ; RV64-NEXT:    vsll.vi v9, v8, 15
 ; RV64-NEXT:    vsrl.vi v8, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 3
-; RV64-NEXT:    addi a0, a0, -1366
 ; RV64-NEXT:    vmsgtu.vx v0, v8, a0
 ; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    vmerge.vim v8, v8, -1, v0
@@ -81,11 +81,11 @@ define <vscale x 1 x i16> @test_urem_vec_even_divisor_eq1(<vscale x 1 x i16> %x)
 ; RV32-NEXT:    lui a0, 1048571
 ; RV32-NEXT:    addi a0, a0, -1365
 ; RV32-NEXT:    vmul.vx v8, v8, a0
+; RV32-NEXT:    lui a0, 3
+; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vsll.vi v9, v8, 15
 ; RV32-NEXT:    vsrl.vi v8, v8, 1
 ; RV32-NEXT:    vor.vv v8, v8, v9
-; RV32-NEXT:    lui a0, 3
-; RV32-NEXT:    addi a0, a0, -1366
 ; RV32-NEXT:    vmsgtu.vx v0, v8, a0
 ; RV32-NEXT:    vmv.v.i v8, 0
 ; RV32-NEXT:    vmerge.vim v8, v8, -1, v0
@@ -99,11 +99,11 @@ define <vscale x 1 x i16> @test_urem_vec_even_divisor_eq1(<vscale x 1 x i16> %x)
 ; RV64-NEXT:    lui a0, 1048571
 ; RV64-NEXT:    addi a0, a0, -1365
 ; RV64-NEXT:    vmul.vx v8, v8, a0
+; RV64-NEXT:    lui a0, 3
+; RV64-NEXT:    addi a0, a0, -1366
 ; RV64-NEXT:    vsll.vi v9, v8, 15
 ; RV64-NEXT:    vsrl.vi v8, v8, 1
 ; RV64-NEXT:    vor.vv v8, v8, v9
-; RV64-NEXT:    lui a0, 3
-; RV64-NEXT:    addi a0, a0, -1366
 ; RV64-NEXT:    vmsgtu.vx v0, v8, a0
 ; RV64-NEXT:    vmv.v.i v8, 0
 ; RV64-NEXT:    vmerge.vim v8, v8, -1, v0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
index a77208404911cc..ebd550013ec78f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vadd-vp.ll
@@ -1346,9 +1346,9 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -1401,10 +1401,10 @@ define <vscale x 32 x i32> @vadd_vi_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
+; CHECK-NEXT:    srli a2, a0, 2
 ; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll
index b810c996010438..de15e185998c48 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-sdnode.ll
@@ -797,16 +797,16 @@ define <vscale x 1 x i64> @vdiv_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
 ; RV32-V-NEXT:    addi sp, sp, -16
 ; RV32-V-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V-NEXT:    lui a0, 748983
-; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    lui a1, 898779
+; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    addi a1, a1, 1755
 ; RV32-V-NEXT:    sw a1, 8(sp)
 ; RV32-V-NEXT:    sw a0, 12(sp)
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-V-NEXT:    vlse64.v v9, (a0), zero
-; RV32-V-NEXT:    vmulh.vv v8, v8, v9
 ; RV32-V-NEXT:    li a0, 63
+; RV32-V-NEXT:    vmulh.vv v8, v8, v9
 ; RV32-V-NEXT:    vsrl.vx v9, v8, a0
 ; RV32-V-NEXT:    vsra.vi v8, v8, 1
 ; RV32-V-NEXT:    vadd.vv v8, v8, v9
@@ -878,16 +878,16 @@ define <vscale x 2 x i64> @vdiv_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
 ; RV32-V-NEXT:    addi sp, sp, -16
 ; RV32-V-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V-NEXT:    lui a0, 748983
-; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    lui a1, 898779
+; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    addi a1, a1, 1755
 ; RV32-V-NEXT:    sw a1, 8(sp)
 ; RV32-V-NEXT:    sw a0, 12(sp)
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-V-NEXT:    vlse64.v v10, (a0), zero
-; RV32-V-NEXT:    vmulh.vv v8, v8, v10
 ; RV32-V-NEXT:    li a0, 63
+; RV32-V-NEXT:    vmulh.vv v8, v8, v10
 ; RV32-V-NEXT:    vsrl.vx v10, v8, a0
 ; RV32-V-NEXT:    vsra.vi v8, v8, 1
 ; RV32-V-NEXT:    vadd.vv v8, v8, v10
@@ -959,16 +959,16 @@ define <vscale x 4 x i64> @vdiv_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
 ; RV32-V-NEXT:    addi sp, sp, -16
 ; RV32-V-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V-NEXT:    lui a0, 748983
-; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    lui a1, 898779
+; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    addi a1, a1, 1755
 ; RV32-V-NEXT:    sw a1, 8(sp)
 ; RV32-V-NEXT:    sw a0, 12(sp)
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-V-NEXT:    vlse64.v v12, (a0), zero
-; RV32-V-NEXT:    vmulh.vv v8, v8, v12
 ; RV32-V-NEXT:    li a0, 63
+; RV32-V-NEXT:    vmulh.vv v8, v8, v12
 ; RV32-V-NEXT:    vsrl.vx v12, v8, a0
 ; RV32-V-NEXT:    vsra.vi v8, v8, 1
 ; RV32-V-NEXT:    vadd.vv v8, v8, v12
@@ -1040,16 +1040,16 @@ define <vscale x 8 x i64> @vdiv_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
 ; RV32-V-NEXT:    addi sp, sp, -16
 ; RV32-V-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V-NEXT:    lui a0, 748983
-; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    lui a1, 898779
+; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    addi a1, a1, 1755
 ; RV32-V-NEXT:    sw a1, 8(sp)
 ; RV32-V-NEXT:    sw a0, 12(sp)
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-V-NEXT:    vlse64.v v16, (a0), zero
-; RV32-V-NEXT:    vmulh.vv v8, v8, v16
 ; RV32-V-NEXT:    li a0, 63
+; RV32-V-NEXT:    vmulh.vv v8, v8, v16
 ; RV32-V-NEXT:    vsrl.vx v16, v8, a0
 ; RV32-V-NEXT:    vsra.vi v8, v8, 1
 ; RV32-V-NEXT:    vadd.vv v8, v8, v16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
index 470d61aa96a23f..c7b5200979370e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdiv-vp.ll
@@ -11,10 +11,10 @@ define <vscale x 8 x i7> @vdiv_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
 ; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
 ; CHECK-NEXT:    vdiv.vv v8, v8, v9, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
index c29f305cbfb217..850ad863dd384e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vdivu-vp.ll
@@ -10,11 +10,10 @@ define <vscale x 8 x i7> @vdivu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vdivu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vdivu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
index b19995a5aba9a4..8e3cedfbeeb032 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
@@ -14,24 +14,24 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
 ; RV32-NEXT:    lw a0, 0(a0)
 ; RV32-NEXT:    srli a2, a0, 16
 ; RV32-NEXT:    slli a3, a0, 16
-; RV32-NEXT:    srli a4, a3, 24
+; RV32-NEXT:    slli a4, a0, 24
+; RV32-NEXT:    slli a5, a0, 8
+; RV32-NEXT:    srli a6, a3, 24
 ; RV32-NEXT:    srai a3, a3, 24
-; RV32-NEXT:    slli a5, a0, 24
+; RV32-NEXT:    srai a4, a4, 24
 ; RV32-NEXT:    srai a5, a5, 24
-; RV32-NEXT:    slli a6, a0, 8
-; RV32-NEXT:    srai a6, a6, 24
-; RV32-NEXT:    sgtz a6, a6
 ; RV32-NEXT:    sgtz a5, a5
+; RV32-NEXT:    sgtz a4, a4
 ; RV32-NEXT:    sgtz a3, a3
 ; RV32-NEXT:    neg a3, a3
-; RV32-NEXT:    and a3, a3, a4
-; RV32-NEXT:    slli a3, a3, 8
-; RV32-NEXT:    neg a4, a5
+; RV32-NEXT:    neg a4, a4
+; RV32-NEXT:    neg a5, a5
+; RV32-NEXT:    and a3, a3, a6
 ; RV32-NEXT:    and a0, a4, a0
+; RV32-NEXT:    and a2, a5, a2
+; RV32-NEXT:    slli a3, a3, 8
 ; RV32-NEXT:    andi a0, a0, 255
 ; RV32-NEXT:    or a0, a0, a3
-; RV32-NEXT:    neg a3, a6
-; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    sh a0, 0(a1)
 ; RV32-NEXT:    sb a2, 2(a1)
 ; RV32-NEXT:    ret
@@ -41,24 +41,24 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
 ; RV64-NEXT:    lw a0, 0(a0)
 ; RV64-NEXT:    srliw a2, a0, 16
 ; RV64-NEXT:    slli a3, a0, 48
-; RV64-NEXT:    srli a4, a3, 56
+; RV64-NEXT:    slli a4, a0, 56
+; RV64-NEXT:    slli a5, a0, 40
+; RV64-NEXT:    srli a6, a3, 56
 ; RV64-NEXT:    srai a3, a3, 56
-; RV64-NEXT:    slli a5, a0, 56
+; RV64-NEXT:    srai a4, a4, 56
 ; RV64-NEXT:    srai a5, a5, 56
-; RV64-NEXT:    slli a6, a0, 40
-; RV64-NEXT:    srai a6, a6, 56
-; RV64-NEXT:    sgtz a6, a6
 ; RV64-NEXT:    sgtz a5, a5
+; RV64-NEXT:    sgtz a4, a4
 ; RV64-NEXT:    sgtz a3, a3
 ; RV64-NEXT:    negw a3, a3
-; RV64-NEXT:    and a3, a3, a4
-; RV64-NEXT:    slli a3, a3, 8
-; RV64-NEXT:    negw a4, a5
+; RV64-NEXT:    negw a4, a4
+; RV64-NEXT:    negw a5, a5
+; RV64-NEXT:    and a3, a3, a6
 ; RV64-NEXT:    and a0, a4, a0
+; RV64-NEXT:    and a2, a5, a2
+; RV64-NEXT:    slli a3, a3, 8
 ; RV64-NEXT:    andi a0, a0, 255
 ; RV64-NEXT:    or a0, a0, a3
-; RV64-NEXT:    negw a3, a6
-; RV64-NEXT:    and a2, a3, a2
 ; RV64-NEXT:    sh a0, 0(a1)
 ; RV64-NEXT:    sb a2, 2(a1)
 ; RV64-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 075e463e41a6b0..54d2f3f68989bf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -11,25 +11,26 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) {
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v0, v0, 2
 ; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v10, v9, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v12, v9, 1, v0
-; CHECK-NEXT:    vnsrl.wi v8, v12, 0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    vadd.vv v11, v9, v9
 ; CHECK-NEXT:    li a0, -256
+; CHECK-NEXT:    vmerge.vim v11, v10, 1, v0
+; CHECK-NEXT:    vadd.vv v12, v9, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, mu
-; CHECK-NEXT:    vadd.vi v9, v11, -16
-; CHECK-NEXT:    vrgather.vv v8, v10, v9, v0.t
-; CHECK-NEXT:    vmsne.vi v9, v8, 0
-; CHECK-NEXT:    vnsrl.wi v8, v12, 8
-; CHECK-NEXT:    vadd.vi v11, v11, -15
-; CHECK-NEXT:    vrgather.vv v8, v10, v11, v0.t
+; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
+; CHECK-NEXT:    vadd.vi v8, v12, -16
+; CHECK-NEXT:    vadd.vi v12, v12, -15
+; CHECK-NEXT:    vnsrl.wi v10, v14, 0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vrgather.vv v10, v11, v8, v0.t
+; CHECK-NEXT:    vnsrl.wi v8, v14, 8
+; CHECK-NEXT:    vmsne.vi v10, v10, 0
+; CHECK-NEXT:    vrgather.vv v8, v11, v12, v0.t
 ; CHECK-NEXT:    vmsne.vi v8, v8, 0
-; CHECK-NEXT:    vmv.v.v v0, v9
+; CHECK-NEXT:    vmv.v.v v0, v10
 ; CHECK-NEXT:    ret
 %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
 ret {<16 x i1>, <16 x i1>} %retval
@@ -95,16 +96,17 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v8, 4
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vadd.vv v14, v12, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v15, v14, -4
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v8, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v12, v16, v15, v0.t
+; CHECK-NEXT:    vadd.vi v10, v14, -4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT:    vrgatherei16.vv v12, v16, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vadd.vi v15, v14, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
@@ -122,27 +124,28 @@ define {<4 x i64>, <4 x i64>} @vector_deinterleave_v4i64_v8i64(<8 x i64> %vec) {
 define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) {
 ; CHECK-LABEL: vector_deinterleave_v8i64_v16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv8r.v v16, v8
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vadd.vv v7, v8, v8
-; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v7
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v16
 ; CHECK-NEXT:    vmv.v.i v0, -16
-; CHECK-NEXT:    vadd.vi v12, v7, -8
 ; CHECK-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v16, 8
-; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v8, v24, v12, v0.t
+; CHECK-NEXT:    vslidedown.vi v24, v8, 8
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v20, v16, v16
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v16, v8, v20
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vadd.vi v12, v20, -8
+; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; CHECK-NEXT:    vrgatherei16.vv v16, v24, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v20, v7, 1
+; CHECK-NEXT:    vadd.vi v21, v20, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v12, v16, v20
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v21
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v16, v7, -7
+; CHECK-NEXT:    vadd.vi v8, v20, -7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v12, v24, v16, v0.t
+; CHECK-NEXT:    vrgatherei16.vv v12, v24, v8, v0.t
+; CHECK-NEXT:    vmv.v.v v8, v16
 ; CHECK-NEXT:    ret
   %retval = call {<8 x i64>, <8 x i64>} @llvm.vector.deinterleave2.v16i64(<16 x i64> %vec)
   ret {<8 x i64>, <8 x i64>} %retval
@@ -239,16 +242,17 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v16, v8, 4
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 12
 ; CHECK-NEXT:    vadd.vv v14, v12, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v14
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v15, v14, -4
-; CHECK-NEXT:    vmv.v.i v0, 12
-; CHECK-NEXT:    vsetivli zero, 4, e64, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v16, v8, 4
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
-; CHECK-NEXT:    vrgatherei16.vv v12, v16, v15, v0.t
+; CHECK-NEXT:    vadd.vi v10, v14, -4
+; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
+; CHECK-NEXT:    vrgatherei16.vv v12, v16, v10, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vadd.vi v15, v14, 1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index f4c7f0f13e984b..39a1bfcda3d83f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -21,8 +21,8 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    vnsrl.wi v10, v12, 8
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    vmsne.vi v8, v10, 0
 ; CHECK-NEXT:    ret
   %vec = load <vscale x 32 x i1>, ptr %p
@@ -106,81 +106,96 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    li a2, 40
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    vadd.vv v24, v8, v8
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v0, (a1)
+; CHECK-NEXT:    vadd.vi v8, v24, 1
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vadd.vv v16, v8, v8
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vrgather.vv v8, v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v24, v8, v16
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vrgather.vv v24, v16, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vrgather.vv v8, v0, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vadd.vi v8, v16, 1
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vrgather.vv v24, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v16, v0, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vrgather.vv v24, v0, v8
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vrgather.vv v16, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v12, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v28, v8
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v20, v8
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v28, v16
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    li a1, 40
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 201cfaa931b417..f20a90a4223139 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -20,8 +20,8 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v14, v10, 1, v0
 ; CHECK-NEXT:    vnsrl.wi v8, v12, 0
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    vnsrl.wi v10, v12, 8
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    vmsne.vi v8, v10, 0
 ; CHECK-NEXT:    ret
 %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
@@ -109,40 +109,20 @@ declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4
 define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv128i1(<vscale x 128 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v12, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 0
-; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v24, v24, 1, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v16, 0
+; CHECK-NEXT:    vnsrl.wi v0, v16, 8
 ; CHECK-NEXT:    vnsrl.wi v12, v24, 0
+; CHECK-NEXT:    vnsrl.wi v4, v24, 8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 8
-; CHECK-NEXT:    vnsrl.wi v20, v24, 8
-; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v16, 0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vmsne.vi v16, v8, 0
+; CHECK-NEXT:    vmsne.vi v8, v0, 0
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    ret
 %retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
 ret {<vscale x 64 x i1>, <vscale x 64 x i1>} %retval
@@ -154,8 +134,8 @@ define {<vscale x 64 x i8>, <vscale x 64 x i8>} @vector_deinterleave_nxv64i8_nxv
 ; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v24, 0
-; CHECK-NEXT:    vnsrl.wi v12, v16, 0
 ; CHECK-NEXT:    vnsrl.wi v0, v24, 8
+; CHECK-NEXT:    vnsrl.wi v12, v16, 0
 ; CHECK-NEXT:    vnsrl.wi v4, v16, 8
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
@@ -169,8 +149,8 @@ define {<vscale x 32 x i16>, <vscale x 32 x i16>} @vector_deinterleave_nxv32i16_
 ; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v24, 0
-; CHECK-NEXT:    vnsrl.wi v12, v16, 0
 ; CHECK-NEXT:    vnsrl.wi v0, v24, 16
+; CHECK-NEXT:    vnsrl.wi v12, v16, 0
 ; CHECK-NEXT:    vnsrl.wi v4, v16, 16
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
@@ -410,8 +390,8 @@ define {<vscale x 32 x bfloat>, <vscale x 32 x bfloat>} @vector_deinterleave_nxv
 ; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v24, 0
-; CHECK-NEXT:    vnsrl.wi v12, v16, 0
 ; CHECK-NEXT:    vnsrl.wi v0, v24, 16
+; CHECK-NEXT:    vnsrl.wi v12, v16, 0
 ; CHECK-NEXT:    vnsrl.wi v4, v16, 16
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
@@ -425,8 +405,8 @@ define {<vscale x 32 x half>, <vscale x 32 x half>} @vector_deinterleave_nxv32f1
 ; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v24, 0
-; CHECK-NEXT:    vnsrl.wi v12, v16, 0
 ; CHECK-NEXT:    vnsrl.wi v0, v24, 16
+; CHECK-NEXT:    vnsrl.wi v12, v16, 0
 ; CHECK-NEXT:    vnsrl.wi v4, v16, 16
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index e730ae230d5a0d..7b0ac01918b9bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -9,9 +9,9 @@
 define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) {
 ; CHECK-LABEL: vector_interleave_v32i1_v16i1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vi v0, v8, 2
-; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -27,9 +27,9 @@ define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) {
 ;
 ; ZVBB-LABEL: vector_interleave_v32i1_v16i1:
 ; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    li a0, 32
 ; ZVBB-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
 ; ZVBB-NEXT:    vslideup.vi v0, v8, 2
-; ZVBB-NEXT:    li a0, 32
 ; ZVBB-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; ZVBB-NEXT:    vmv.v.i v8, 0
 ; ZVBB-NEXT:    vmerge.vim v8, v8, 1, v0
@@ -92,10 +92,10 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vector_interleave_v4i64_v2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
 ; CHECK-NEXT:    lui a0, 12304
 ; CHECK-NEXT:    addi a0, a0, 512
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 2
 ; CHECK-NEXT:    vmv.s.x v10, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v12, v10
@@ -107,10 +107,10 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; ZVBB-LABEL: vector_interleave_v4i64_v2i64:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vmv1r.v v10, v9
-; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZVBB-NEXT:    vslideup.vi v8, v10, 2
 ; ZVBB-NEXT:    lui a0, 12304
 ; ZVBB-NEXT:    addi a0, a0, 512
+; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v10, 2
 ; ZVBB-NEXT:    vmv.s.x v10, a0
 ; ZVBB-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vsext.vf2 v12, v10
@@ -240,10 +240,10 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
 ; CHECK-LABEL: vector_interleave_v4f64_v2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v9
-; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 2
 ; CHECK-NEXT:    lui a0, 12304
 ; CHECK-NEXT:    addi a0, a0, 512
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 2
 ; CHECK-NEXT:    vmv.s.x v10, a0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v12, v10
@@ -255,10 +255,10 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
 ; ZVBB-LABEL: vector_interleave_v4f64_v2f64:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vmv1r.v v10, v9
-; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZVBB-NEXT:    vslideup.vi v8, v10, 2
 ; ZVBB-NEXT:    lui a0, 12304
 ; ZVBB-NEXT:    addi a0, a0, 512
+; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v10, 2
 ; ZVBB-NEXT:    vmv.s.x v10, a0
 ; ZVBB-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vsext.vf2 v12, v10

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index 16ce25f86462e3..bc203e215d8786 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -10,22 +10,22 @@ define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vsc
 ; CHECK-LABEL: vector_interleave_store_nxv32i1_nxv16i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    li a1, -1
+; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    vwaddu.vv v16, v8, v12
-; CHECK-NEXT:    li a1, -1
 ; CHECK-NEXT:    vwmaccu.vx v16, a1, v12
 ; CHECK-NEXT:    vmsne.vi v8, v18, 0
 ; CHECK-NEXT:    vmsne.vi v9, v16, 0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    add a1, a2, a2
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a2
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsm.v v9, (a0)
 ; CHECK-NEXT:    ret
@@ -97,50 +97,24 @@ define void @vector_interleave_store_nxv8i64_nxv4i64(<vscale x 4 x i64> %a, <vsc
 define void @vector_interleave_store_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b, ptr %p) {
 ; CHECK-LABEL: vector_interleave_store_nxv16i64_nxv8i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, mu
+; CHECK-NEXT:    vid.v v6
+; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vid.v v24
-; CHECK-NEXT:    vand.vi v26, v24, 1
-; CHECK-NEXT:    vmsne.vi v28, v26, 0
-; CHECK-NEXT:    vsrl.vi v24, v24, 1
-; CHECK-NEXT:    vmv1r.v v0, v28
-; CHECK-NEXT:    vadd.vx v24, v24, a2, v0.t
-; CHECK-NEXT:    vmv4r.v v12, v16
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v0, v8, v24
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v28, v16
 ; CHECK-NEXT:    vmv4r.v v16, v12
-; CHECK-NEXT:    vrgatherei16.vv v8, v16, v24
+; CHECK-NEXT:    vsrl.vi v8, v6, 1
+; CHECK-NEXT:    vand.vi v10, v6, 1
 ; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    vmsne.vi v0, v10, 0
 ; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vs8r.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vs8r.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vadd.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v0, v24, v8
+; CHECK-NEXT:    vrgatherei16.vv v24, v16, v8
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vs8r.v v0, (a0)
 ; CHECK-NEXT:    ret
   %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
   store <vscale x 16 x i64> %res, ptr %p

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 4d14d0013236f2..26e9afcb1d109b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -12,42 +12,42 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-LABEL: vector_interleave_nxv32i1_nxv16i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    vwaddu.vv v16, v8, v12
-; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vwmaccu.vx v16, a0, v12
 ; CHECK-NEXT:    vmsne.vi v8, v18, 0
 ; CHECK-NEXT:    vmsne.vi v0, v16, 0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v8, a0
+; CHECK-NEXT:    add a0, a1, a1
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v8, a1
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv32i1_nxv16i1:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vmv1r.v v9, v0
+; ZVBB-NEXT:    vmv1r.v v0, v8
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, mu
 ; ZVBB-NEXT:    vmv.v.i v10, 0
-; ZVBB-NEXT:    vmv1r.v v0, v8
+; ZVBB-NEXT:    li a0, 1
+; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    vmerge.vim v10, v10, 1, v0
+; ZVBB-NEXT:    srli a1, a1, 2
 ; ZVBB-NEXT:    vwsll.vi v12, v10, 8
-; ZVBB-NEXT:    li a0, 1
 ; ZVBB-NEXT:    vmv1r.v v0, v9
 ; ZVBB-NEXT:    vwaddu.wx v12, v12, a0, v0.t
 ; ZVBB-NEXT:    vmsne.vi v8, v14, 0
 ; ZVBB-NEXT:    vmsne.vi v0, v12, 0
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    srli a0, a0, 2
-; ZVBB-NEXT:    add a1, a0, a0
-; ZVBB-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v8, a0
+; ZVBB-NEXT:    add a0, a1, a1
+; ZVBB-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v8, a1
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
   ret <vscale x 32 x i1> %res
@@ -121,9 +121,9 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
 ; CHECK-LABEL: vector_interleave_nxv4i64_nxv2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vand.vi v13, v12, 1
 ; CHECK-NEXT:    vmsne.vi v0, v13, 0
 ; CHECK-NEXT:    vsrl.vi v16, v12, 1
@@ -136,9 +136,9 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
 ; ZVBB-LABEL: vector_interleave_nxv4i64_nxv2i64:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; ZVBB-NEXT:    vid.v v12
+; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vand.vi v13, v12, 1
 ; ZVBB-NEXT:    vmsne.vi v0, v13, 0
 ; ZVBB-NEXT:    vsrl.vi v16, v12, 1
@@ -161,23 +161,22 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
 ; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    li a0, -1
 ; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v24, v8, v16
-; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v24, a0, v16
-; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v24, 0
+; CHECK-NEXT:    vmerge.vim v24, v24, 1, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vwaddu.vv v24, v12, v20
-; CHECK-NEXT:    vwmaccu.vx v24, a0, v20
+; CHECK-NEXT:    vwaddu.vv v8, v24, v16
+; CHECK-NEXT:    vwaddu.vv v0, v28, v20
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
+; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v24, 0
+; CHECK-NEXT:    vmsne.vi v16, v8, 0
+; CHECK-NEXT:    vmsne.vi v8, v0, 0
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv128i1_nxv64i1:
@@ -186,17 +185,16 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
 ; ZVBB-NEXT:    vmv.v.i v24, 0
 ; ZVBB-NEXT:    vmerge.vim v16, v24, 1, v0
 ; ZVBB-NEXT:    vmv1r.v v0, v8
-; ZVBB-NEXT:    vmerge.vim v8, v24, 1, v0
-; ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVBB-NEXT:    vwsll.vi v24, v8, 8
-; ZVBB-NEXT:    vwaddu.wv v24, v24, v16
-; ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; ZVBB-NEXT:    vmsne.vi v0, v24, 0
+; ZVBB-NEXT:    vmerge.vim v24, v24, 1, v0
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVBB-NEXT:    vwsll.vi v24, v12, 8
-; ZVBB-NEXT:    vwaddu.wv v24, v24, v20
+; ZVBB-NEXT:    vwsll.vi v8, v24, 8
+; ZVBB-NEXT:    vwsll.vi v0, v28, 8
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v16
+; ZVBB-NEXT:    vwaddu.wv v0, v0, v20
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
-; ZVBB-NEXT:    vmsne.vi v8, v24, 0
+; ZVBB-NEXT:    vmsne.vi v16, v8, 0
+; ZVBB-NEXT:    vmsne.vi v8, v0, 0
+; ZVBB-NEXT:    vmv1r.v v0, v16
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
   ret <vscale x 128 x i1> %res
@@ -209,8 +207,8 @@ define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v8, v24, v16
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwaddu.vv v0, v28, v20
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
@@ -220,8 +218,8 @@ define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv64i8(<vscale x 64 x i8
 ; ZVBB-NEXT:    vmv8r.v v24, v8
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; ZVBB-NEXT:    vwsll.vi v8, v16, 8
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwsll.vi v0, v20, 8
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
@@ -236,8 +234,8 @@ define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v8, v24, v16
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwaddu.vv v0, v28, v20
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
@@ -247,8 +245,8 @@ define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv32i16(<vscale x 32 x i
 ; ZVBB-NEXT:    vmv8r.v v24, v8
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVBB-NEXT:    vwsll.vi v8, v16, 16
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwsll.vi v0, v20, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
@@ -263,8 +261,8 @@ define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v8, v24, v16
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwaddu.vv v0, v28, v20
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
@@ -275,8 +273,8 @@ define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i
 ; ZVBB-NEXT:    li a0, 32
 ; ZVBB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; ZVBB-NEXT:    vwsll.vx v8, v16, a0
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwsll.vx v0, v20, a0
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
@@ -287,76 +285,40 @@ define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv16i32(<vscale x 16 x i
 define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b) {
 ; CHECK-LABEL: vector_interleave_nxv16i64_nxv8i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv8r.v v0, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vid.v v24
-; CHECK-NEXT:    vand.vi v26, v24, 1
-; CHECK-NEXT:    vmsne.vi v10, v26, 0
-; CHECK-NEXT:    vsrl.vi v8, v24, 1
-; CHECK-NEXT:    vmv8r.v v24, v0
-; CHECK-NEXT:    vmv4r.v v12, v4
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vid.v v6
+; CHECK-NEXT:    vmv8r.v v24, v8
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vmv4r.v v28, v16
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v0, v24, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv4r.v v16, v12
-; CHECK-NEXT:    vrgatherei16.vv v24, v16, v8
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vand.vi v8, v6, 1
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vsrl.vi v6, v6, 1
+; CHECK-NEXT:    vadd.vx v6, v6, a0, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v8, v24, v6
+; CHECK-NEXT:    vrgatherei16.vv v24, v16, v6
 ; CHECK-NEXT:    vmv.v.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv16i64_nxv8i64:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 3
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVBB-NEXT:    vmv8r.v v0, v8
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    srli a0, a0, 1
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
-; ZVBB-NEXT:    vid.v v24
-; ZVBB-NEXT:    vand.vi v26, v24, 1
-; ZVBB-NEXT:    vmsne.vi v10, v26, 0
-; ZVBB-NEXT:    vsrl.vi v8, v24, 1
-; ZVBB-NEXT:    vmv8r.v v24, v0
-; ZVBB-NEXT:    vmv4r.v v12, v4
-; ZVBB-NEXT:    vmv1r.v v0, v10
-; ZVBB-NEXT:    vadd.vx v8, v8, a0, v0.t
+; ZVBB-NEXT:    vid.v v6
+; ZVBB-NEXT:    vmv8r.v v24, v8
+; ZVBB-NEXT:    srli a0, a0, 1
 ; ZVBB-NEXT:    vmv4r.v v28, v16
-; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; ZVBB-NEXT:    vrgatherei16.vv v0, v24, v8
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; ZVBB-NEXT:    vmv4r.v v16, v12
-; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v8
-; ZVBB-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVBB-NEXT:    vand.vi v8, v6, 1
+; ZVBB-NEXT:    vmsne.vi v0, v8, 0
+; ZVBB-NEXT:    vsrl.vi v6, v6, 1
+; ZVBB-NEXT:    vadd.vx v6, v6, a0, v0.t
+; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v6
+; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v6
 ; ZVBB-NEXT:    vmv.v.v v16, v24
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 3
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64> %a, <vscale x 8 x i64> %b)
   ret <vscale x 16 x i64> %res
@@ -376,14 +338,14 @@ define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v10, v8, v9
 ; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v10, a0
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v10, v8, a0
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v10, a1
+; CHECK-NEXT:    add a0, a1, a1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v8, a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 ;
@@ -391,8 +353,8 @@ define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vwsll.vi v10, v9, 16
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
@@ -432,14 +394,14 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v10, v8, v9
 ; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v10, a0
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v10, v8, a0
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v8, v10, a1
+; CHECK-NEXT:    add a0, a1, a1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v8, a1
 ; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 ;
@@ -447,8 +409,8 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVBB-NEXT:    vwsll.vi v10, v9, 16
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
 ; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
@@ -572,9 +534,9 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
 ; CHECK-LABEL: vector_interleave_nxv4f64_nxv2f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vid.v v12
+; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vand.vi v13, v12, 1
 ; CHECK-NEXT:    vmsne.vi v0, v13, 0
 ; CHECK-NEXT:    vsrl.vi v16, v12, 1
@@ -587,9 +549,9 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
 ; ZVBB-LABEL: vector_interleave_nxv4f64_nxv2f64:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
 ; ZVBB-NEXT:    vid.v v12
+; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    vand.vi v13, v12, 1
 ; ZVBB-NEXT:    vmsne.vi v0, v13, 0
 ; ZVBB-NEXT:    vsrl.vi v16, v12, 1
@@ -617,8 +579,8 @@ define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv32bf16(<vscale x 3
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v8, v24, v16
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwaddu.vv v0, v28, v20
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
@@ -628,8 +590,8 @@ define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv32bf16(<vscale x 3
 ; ZVBB-NEXT:    vmv8r.v v24, v8
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVBB-NEXT:    vwsll.vi v8, v16, 16
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwsll.vi v0, v20, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
@@ -644,8 +606,8 @@ define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v8, v24, v16
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwaddu.vv v0, v28, v20
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
@@ -655,8 +617,8 @@ define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x
 ; ZVBB-NEXT:    vmv8r.v v24, v8
 ; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVBB-NEXT:    vwsll.vi v8, v16, 16
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwsll.vi v0, v20, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
@@ -671,8 +633,8 @@ define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vwaddu.vv v8, v24, v16
 ; CHECK-NEXT:    li a0, -1
-; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwaddu.vv v0, v28, v20
+; CHECK-NEXT:    vwmaccu.vx v8, a0, v16
 ; CHECK-NEXT:    vwmaccu.vx v0, a0, v20
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    ret
@@ -683,8 +645,8 @@ define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x
 ; ZVBB-NEXT:    li a0, 32
 ; ZVBB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; ZVBB-NEXT:    vwsll.vx v8, v16, a0
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwsll.vx v0, v20, a0
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v24
 ; ZVBB-NEXT:    vwaddu.wv v0, v0, v28
 ; ZVBB-NEXT:    vmv8r.v v16, v0
 ; ZVBB-NEXT:    ret
@@ -695,76 +657,40 @@ define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x
 define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
 ; CHECK-LABEL: vector_interleave_nxv16f64_nxv8f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv8r.v v0, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
-; CHECK-NEXT:    vid.v v24
-; CHECK-NEXT:    vand.vi v26, v24, 1
-; CHECK-NEXT:    vmsne.vi v10, v26, 0
-; CHECK-NEXT:    vsrl.vi v8, v24, 1
-; CHECK-NEXT:    vmv8r.v v24, v0
-; CHECK-NEXT:    vmv4r.v v12, v4
-; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT:    vid.v v6
+; CHECK-NEXT:    vmv8r.v v24, v8
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vmv4r.v v28, v16
-; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v0, v24, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv4r.v v16, v12
-; CHECK-NEXT:    vrgatherei16.vv v24, v16, v8
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vand.vi v8, v6, 1
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vsrl.vi v6, v6, 1
+; CHECK-NEXT:    vadd.vx v6, v6, a0, v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v8, v24, v6
+; CHECK-NEXT:    vrgatherei16.vv v24, v16, v6
 ; CHECK-NEXT:    vmv.v.v v16, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 3
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVBB-NEXT:    vmv8r.v v0, v8
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    srli a0, a0, 1
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
-; ZVBB-NEXT:    vid.v v24
-; ZVBB-NEXT:    vand.vi v26, v24, 1
-; ZVBB-NEXT:    vmsne.vi v10, v26, 0
-; ZVBB-NEXT:    vsrl.vi v8, v24, 1
-; ZVBB-NEXT:    vmv8r.v v24, v0
-; ZVBB-NEXT:    vmv4r.v v12, v4
-; ZVBB-NEXT:    vmv1r.v v0, v10
-; ZVBB-NEXT:    vadd.vx v8, v8, a0, v0.t
+; ZVBB-NEXT:    vid.v v6
+; ZVBB-NEXT:    vmv8r.v v24, v8
+; ZVBB-NEXT:    srli a0, a0, 1
 ; ZVBB-NEXT:    vmv4r.v v28, v16
-; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; ZVBB-NEXT:    vrgatherei16.vv v0, v24, v8
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; ZVBB-NEXT:    vmv4r.v v16, v12
-; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v8
-; ZVBB-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVBB-NEXT:    vand.vi v8, v6, 1
+; ZVBB-NEXT:    vmsne.vi v0, v8, 0
+; ZVBB-NEXT:    vsrl.vi v6, v6, 1
+; ZVBB-NEXT:    vadd.vx v6, v6, a0, v0.t
+; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v6
+; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v6
 ; ZVBB-NEXT:    vmv.v.v v16, v24
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 3
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
   ret <vscale x 16 x double> %res

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
index 5460caea196cf8..6a72043ca7e8e6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll
@@ -12,18 +12,18 @@ define <vscale x 1 x i1> @splice_nxv1i1_offset_negone(<vscale x 1 x i1> %a, <vsc
 ; CHECK-LABEL: splice_nxv1i1_offset_negone:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vslidedown.vx v9, v9, a0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 -1)
@@ -34,20 +34,20 @@ define <vscale x 1 x i1> @splice_nxv1i1_offset_max(<vscale x 1 x i1> %a, <vscale
 ; CHECK-LABEL: splice_nxv1i1_offset_max:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v9, 1
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a0
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 1 x i1> @llvm.vector.splice.nxv1i1(<vscale x 1 x i1> %a, <vscale x 1 x i1> %b, i32 1)
@@ -60,18 +60,18 @@ define <vscale x 2 x i1> @splice_nxv2i1_offset_negone(<vscale x 2 x i1> %a, <vsc
 ; CHECK-LABEL: splice_nxv2i1_offset_negone:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vslidedown.vx v9, v9, a0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
@@ -82,20 +82,20 @@ define <vscale x 2 x i1> @splice_nxv2i1_offset_max(<vscale x 2 x i1> %a, <vscale
 ; CHECK-LABEL: splice_nxv2i1_offset_max:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -3
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v9, 3
+; CHECK-NEXT:    vslidedown.vi v8, v8, 3
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a0
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 3)
@@ -108,18 +108,18 @@ define <vscale x 4 x i1> @splice_nxv4i1_offset_negone(<vscale x 4 x i1> %a, <vsc
 ; CHECK-LABEL: splice_nxv4i1_offset_negone:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vslidedown.vx v9, v9, a0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
@@ -130,20 +130,20 @@ define <vscale x 4 x i1> @splice_nxv4i1_offset_max(<vscale x 4 x i1> %a, <vscale
 ; CHECK-LABEL: splice_nxv4i1_offset_max:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -7
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v9, 7
+; CHECK-NEXT:    vslidedown.vi v8, v8, 7
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a0
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 7)
@@ -156,17 +156,17 @@ define <vscale x 8 x i1> @splice_nxv8i1_offset_negone(<vscale x 8 x i1> %a, <vsc
 ; CHECK-LABEL: splice_nxv8i1_offset_negone:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    vslidedown.vx v9, v9, a0
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslidedown.vx v8, v8, a0
+; CHECK-NEXT:    vslideup.vi v8, v10, 1
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
@@ -177,19 +177,19 @@ define <vscale x 8 x i1> @splice_nxv8i1_offset_max(<vscale x 8 x i1> %a, <vscale
 ; CHECK-LABEL: splice_nxv8i1_offset_max:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v9, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v10, v8, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a0, a0, -15
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v9, 15
+; CHECK-NEXT:    vslidedown.vi v8, v8, 15
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a0
-; CHECK-NEXT:    vand.vi v8, v9, 1
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 15)
@@ -202,13 +202,13 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_negone(<vscale x 16 x i1> %a, <
 ; CHECK-LABEL: splice_nxv16i1_offset_negone:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
@@ -224,13 +224,13 @@ define <vscale x 16 x i1> @splice_nxv16i1_offset_max(<vscale x 16 x i1> %a, <vsc
 ; CHECK-LABEL: splice_nxv16i1_offset_max:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vmerge.vim v12, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    addi a0, a0, -31
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
@@ -250,13 +250,13 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_negone(<vscale x 32 x i1> %a, <
 ; CHECK-LABEL: splice_nxv32i1_offset_negone:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
@@ -273,11 +273,11 @@ define <vscale x 32 x i1> @splice_nxv32i1_offset_max(<vscale x 32 x i1> %a, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    addi a0, a0, -63
-; CHECK-NEXT:    li a1, 63
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v16, v16, a1
 ; CHECK-NEXT:    vmv1r.v v0, v8
@@ -297,13 +297,13 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_negone(<vscale x 64 x i1> %a, <
 ; CHECK-LABEL: splice_nxv64i1_offset_negone:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
@@ -320,11 +320,11 @@ define <vscale x 64 x i1> @splice_nxv64i1_offset_max(<vscale x 64 x i1> %a, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vim v24, v16, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 127
+; CHECK-NEXT:    vmerge.vim v24, v16, 1, v0
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    addi a0, a0, -127
-; CHECK-NEXT:    li a1, 127
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v24, a1
 ; CHECK-NEXT:    vmv1r.v v0, v8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
index 42b71d412fde4d..2c92a5da8eecb7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfabs-vp.ll
@@ -465,12 +465,12 @@ define <vscale x 16 x double> @vfabs_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
index c6c92db62bf694..0faebe87f2657d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-constrained-sdnode.ll
@@ -190,19 +190,34 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) strictfp {
 ; CHECK-LABEL: vfadd_vv_nxv32bf16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfadd.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfadd.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 entry:
   %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fadd.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
@@ -212,22 +227,39 @@ entry:
 define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) strictfp {
 ; CHECK-LABEL: vfadd_vf_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v24, v24, v0
+; CHECK-NEXT:    vfadd.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v16, v16, v24
+; CHECK-NEXT:    vfadd.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -485,19 +517,34 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0: # %entry
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
 entry:
   %vc = call <vscale x 32 x half> @llvm.experimental.constrained.fadd.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
@@ -513,22 +560,39 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfadd_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfadd.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
index 53a13b511a799e..19c5ee4a85ed6a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-sdnode.ll
@@ -204,19 +204,34 @@ define <vscale x 16 x bfloat> @vfadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
 ; CHECK-LABEL: vfadd_vv_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfadd.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfadd.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %vc = fadd <vscale x 32 x bfloat> %va, %vb
   ret <vscale x 32 x bfloat> %vc
@@ -225,22 +240,39 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
 ; CHECK-LABEL: vfadd_vf_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v24, v24, v0
+; CHECK-NEXT:    vfadd.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v16, v24, v0
+; CHECK-NEXT:    vfadd.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -512,19 +544,34 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfadd_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %vc = fadd <vscale x 32 x half> %va, %vb
   ret <vscale x 32 x half> %vc
@@ -539,22 +586,39 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfadd_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfadd.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 31762a7d840ec5..1953cfd2a0169f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -413,20 +413,21 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -464,22 +465,23 @@ define <vscale x 32 x bfloat> @vfadd_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -516,23 +518,24 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; CHECK-NEXT:    vmv8r.v v24, v8
+; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 3
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    add a1, a3, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a4, a4, 3
 ; CHECK-NEXT:    add a4, sp, a4
@@ -540,18 +543,18 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a4, a2, 3
-; CHECK-NEXT:    add a2, a4, a2
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a4, a3, 3
+; CHECK-NEXT:    add a3, a4, a3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -603,31 +606,37 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.x v16, a1
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    vmv4r.v v16, v8
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB25_2
@@ -1198,20 +1207,21 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1255,22 +1265,23 @@ define <vscale x 32 x half> @vfadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1313,23 +1324,24 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v8
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    slli a3, a1, 3
+; ZVFHMIN-NEXT:    add a1, a3, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
@@ -1337,18 +1349,18 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 3
-; ZVFHMIN-NEXT:    add a2, a4, a2
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 3
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1406,31 +1418,37 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    vmv4r.v v16, v8
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB51_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll
index 21c5f757e4558d..ec6ab422d64058 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcmp-constrained-sdnode.ll
@@ -57,8 +57,8 @@ define <vscale x 1 x i1> @fcmp_ogt_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -73,8 +73,8 @@ define <vscale x 1 x i1> @fcmp_ogt_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -102,8 +102,8 @@ define <vscale x 1 x i1> @fcmp_oge_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -118,8 +118,8 @@ define <vscale x 1 x i1> @fcmp_oge_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -147,8 +147,8 @@ define <vscale x 1 x i1> @fcmp_olt_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -163,8 +163,8 @@ define <vscale x 1 x i1> @fcmp_olt_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -192,8 +192,8 @@ define <vscale x 1 x i1> @fcmp_ole_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -208,8 +208,8 @@ define <vscale x 1 x i1> @fcmp_ole_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -240,8 +240,8 @@ define <vscale x 1 x i1> @fcmp_one_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -259,8 +259,8 @@ define <vscale x 1 x i1> @fcmp_one_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -336,8 +336,8 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -355,8 +355,8 @@ define <vscale x 1 x i1> @fcmp_ueq_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -388,8 +388,8 @@ define <vscale x 1 x i1> @fcmp_ugt_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -405,8 +405,8 @@ define <vscale x 1 x i1> @fcmp_ugt_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -436,8 +436,8 @@ define <vscale x 1 x i1> @fcmp_uge_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -453,8 +453,8 @@ define <vscale x 1 x i1> @fcmp_uge_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -484,8 +484,8 @@ define <vscale x 1 x i1> @fcmp_ult_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -501,8 +501,8 @@ define <vscale x 1 x i1> @fcmp_ult_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -532,8 +532,8 @@ define <vscale x 1 x i1> @fcmp_ule_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -549,8 +549,8 @@ define <vscale x 1 x i1> @fcmp_ule_fv_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -690,8 +690,8 @@ define <vscale x 2 x i1> @fcmp_ogt_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -706,8 +706,8 @@ define <vscale x 2 x i1> @fcmp_ogt_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -735,8 +735,8 @@ define <vscale x 2 x i1> @fcmp_oge_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -751,8 +751,8 @@ define <vscale x 2 x i1> @fcmp_oge_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -780,8 +780,8 @@ define <vscale x 2 x i1> @fcmp_olt_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -796,8 +796,8 @@ define <vscale x 2 x i1> @fcmp_olt_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -825,8 +825,8 @@ define <vscale x 2 x i1> @fcmp_ole_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -841,8 +841,8 @@ define <vscale x 2 x i1> @fcmp_ole_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -873,8 +873,8 @@ define <vscale x 2 x i1> @fcmp_one_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -892,8 +892,8 @@ define <vscale x 2 x i1> @fcmp_one_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -969,8 +969,8 @@ define <vscale x 2 x i1> @fcmp_ueq_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -988,8 +988,8 @@ define <vscale x 2 x i1> @fcmp_ueq_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -1021,8 +1021,8 @@ define <vscale x 2 x i1> @fcmp_ugt_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1038,8 +1038,8 @@ define <vscale x 2 x i1> @fcmp_ugt_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1069,8 +1069,8 @@ define <vscale x 2 x i1> @fcmp_uge_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1086,8 +1086,8 @@ define <vscale x 2 x i1> @fcmp_uge_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1117,8 +1117,8 @@ define <vscale x 2 x i1> @fcmp_ult_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1134,8 +1134,8 @@ define <vscale x 2 x i1> @fcmp_ult_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1165,8 +1165,8 @@ define <vscale x 2 x i1> @fcmp_ule_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1182,8 +1182,8 @@ define <vscale x 2 x i1> @fcmp_ule_fv_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1323,8 +1323,8 @@ define <vscale x 4 x i1> @fcmp_ogt_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1339,8 +1339,8 @@ define <vscale x 4 x i1> @fcmp_ogt_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1368,8 +1368,8 @@ define <vscale x 4 x i1> @fcmp_oge_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1384,8 +1384,8 @@ define <vscale x 4 x i1> @fcmp_oge_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1413,8 +1413,8 @@ define <vscale x 4 x i1> @fcmp_olt_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1429,8 +1429,8 @@ define <vscale x 4 x i1> @fcmp_olt_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1458,8 +1458,8 @@ define <vscale x 4 x i1> @fcmp_ole_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1474,8 +1474,8 @@ define <vscale x 4 x i1> @fcmp_ole_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -1506,8 +1506,8 @@ define <vscale x 4 x i1> @fcmp_one_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -1525,8 +1525,8 @@ define <vscale x 4 x i1> @fcmp_one_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -1602,8 +1602,8 @@ define <vscale x 4 x i1> @fcmp_ueq_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -1621,8 +1621,8 @@ define <vscale x 4 x i1> @fcmp_ueq_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -1654,8 +1654,8 @@ define <vscale x 4 x i1> @fcmp_ugt_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1671,8 +1671,8 @@ define <vscale x 4 x i1> @fcmp_ugt_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1702,8 +1702,8 @@ define <vscale x 4 x i1> @fcmp_uge_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1719,8 +1719,8 @@ define <vscale x 4 x i1> @fcmp_uge_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1750,8 +1750,8 @@ define <vscale x 4 x i1> @fcmp_ult_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1767,8 +1767,8 @@ define <vscale x 4 x i1> @fcmp_ult_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1798,8 +1798,8 @@ define <vscale x 4 x i1> @fcmp_ule_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -1815,8 +1815,8 @@ define <vscale x 4 x i1> @fcmp_ule_fv_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -2164,9 +2164,9 @@ define <vscale x 8 x i1> @fcmp_one_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -2184,9 +2184,9 @@ define <vscale x 8 x i1> @fcmp_one_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -2263,9 +2263,9 @@ define <vscale x 8 x i1> @fcmp_ueq_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -2283,9 +2283,9 @@ define <vscale x 8 x i1> @fcmp_ueq_fv_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -3981,8 +3981,8 @@ define <vscale x 1 x i1> @fcmp_ogt_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -3997,8 +3997,8 @@ define <vscale x 1 x i1> @fcmp_ogt_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4026,8 +4026,8 @@ define <vscale x 1 x i1> @fcmp_oge_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4042,8 +4042,8 @@ define <vscale x 1 x i1> @fcmp_oge_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4071,8 +4071,8 @@ define <vscale x 1 x i1> @fcmp_olt_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4087,8 +4087,8 @@ define <vscale x 1 x i1> @fcmp_olt_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4116,8 +4116,8 @@ define <vscale x 1 x i1> @fcmp_ole_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4132,8 +4132,8 @@ define <vscale x 1 x i1> @fcmp_ole_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4164,8 +4164,8 @@ define <vscale x 1 x i1> @fcmp_one_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4183,8 +4183,8 @@ define <vscale x 1 x i1> @fcmp_one_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4260,8 +4260,8 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4279,8 +4279,8 @@ define <vscale x 1 x i1> @fcmp_ueq_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4312,8 +4312,8 @@ define <vscale x 1 x i1> @fcmp_ugt_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4329,8 +4329,8 @@ define <vscale x 1 x i1> @fcmp_ugt_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4360,8 +4360,8 @@ define <vscale x 1 x i1> @fcmp_uge_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4377,8 +4377,8 @@ define <vscale x 1 x i1> @fcmp_uge_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4408,8 +4408,8 @@ define <vscale x 1 x i1> @fcmp_ult_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4425,8 +4425,8 @@ define <vscale x 1 x i1> @fcmp_ult_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4456,8 +4456,8 @@ define <vscale x 1 x i1> @fcmp_ule_vf_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4473,8 +4473,8 @@ define <vscale x 1 x i1> @fcmp_ule_fv_nxv1f32(<vscale x 1 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4614,8 +4614,8 @@ define <vscale x 2 x i1> @fcmp_ogt_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4630,8 +4630,8 @@ define <vscale x 2 x i1> @fcmp_ogt_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4659,8 +4659,8 @@ define <vscale x 2 x i1> @fcmp_oge_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4675,8 +4675,8 @@ define <vscale x 2 x i1> @fcmp_oge_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4704,8 +4704,8 @@ define <vscale x 2 x i1> @fcmp_olt_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4720,8 +4720,8 @@ define <vscale x 2 x i1> @fcmp_olt_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4749,8 +4749,8 @@ define <vscale x 2 x i1> @fcmp_ole_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4765,8 +4765,8 @@ define <vscale x 2 x i1> @fcmp_ole_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -4797,8 +4797,8 @@ define <vscale x 2 x i1> @fcmp_one_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4816,8 +4816,8 @@ define <vscale x 2 x i1> @fcmp_one_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4893,8 +4893,8 @@ define <vscale x 2 x i1> @fcmp_ueq_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -4912,8 +4912,8 @@ define <vscale x 2 x i1> @fcmp_ueq_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -4945,8 +4945,8 @@ define <vscale x 2 x i1> @fcmp_ugt_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4962,8 +4962,8 @@ define <vscale x 2 x i1> @fcmp_ugt_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -4993,8 +4993,8 @@ define <vscale x 2 x i1> @fcmp_uge_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5010,8 +5010,8 @@ define <vscale x 2 x i1> @fcmp_uge_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5041,8 +5041,8 @@ define <vscale x 2 x i1> @fcmp_ult_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5058,8 +5058,8 @@ define <vscale x 2 x i1> @fcmp_ult_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5089,8 +5089,8 @@ define <vscale x 2 x i1> @fcmp_ule_vf_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5106,8 +5106,8 @@ define <vscale x 2 x i1> @fcmp_ule_fv_nxv2f32(<vscale x 2 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -5455,9 +5455,9 @@ define <vscale x 4 x i1> @fcmp_one_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -5475,9 +5475,9 @@ define <vscale x 4 x i1> @fcmp_one_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -5554,9 +5554,9 @@ define <vscale x 4 x i1> @fcmp_ueq_vf_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -5574,9 +5574,9 @@ define <vscale x 4 x i1> @fcmp_ueq_fv_nxv4f32(<vscale x 4 x float> %va, float %b
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -7272,8 +7272,8 @@ define <vscale x 1 x i1> @fcmp_ogt_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7288,8 +7288,8 @@ define <vscale x 1 x i1> @fcmp_ogt_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7317,8 +7317,8 @@ define <vscale x 1 x i1> @fcmp_oge_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7333,8 +7333,8 @@ define <vscale x 1 x i1> @fcmp_oge_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7362,8 +7362,8 @@ define <vscale x 1 x i1> @fcmp_olt_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7378,8 +7378,8 @@ define <vscale x 1 x i1> @fcmp_olt_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7407,8 +7407,8 @@ define <vscale x 1 x i1> @fcmp_ole_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7423,8 +7423,8 @@ define <vscale x 1 x i1> @fcmp_ole_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    ret
@@ -7455,8 +7455,8 @@ define <vscale x 1 x i1> @fcmp_one_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -7474,8 +7474,8 @@ define <vscale x 1 x i1> @fcmp_one_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -7551,8 +7551,8 @@ define <vscale x 1 x i1> @fcmp_ueq_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmflt.vf v9, v8, fa0, v0.t
@@ -7570,8 +7570,8 @@ define <vscale x 1 x i1> @fcmp_ueq_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmv.v.v v9, v0
 ; CHECK-NEXT:    vmfgt.vf v9, v8, fa0, v0.t
@@ -7603,8 +7603,8 @@ define <vscale x 1 x i1> @fcmp_ugt_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7620,8 +7620,8 @@ define <vscale x 1 x i1> @fcmp_ugt_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7651,8 +7651,8 @@ define <vscale x 1 x i1> @fcmp_uge_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7668,8 +7668,8 @@ define <vscale x 1 x i1> @fcmp_uge_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7699,8 +7699,8 @@ define <vscale x 1 x i1> @fcmp_ult_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfge.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7716,8 +7716,8 @@ define <vscale x 1 x i1> @fcmp_ult_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmfle.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7747,8 +7747,8 @@ define <vscale x 1 x i1> @fcmp_ule_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v9, v10
 ; CHECK-NEXT:    vmfgt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -7764,8 +7764,8 @@ define <vscale x 1 x i1> @fcmp_ule_fv_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfeq.vv v10, v8, v8
+; CHECK-NEXT:    vmfeq.vf v9, v9, fa0
 ; CHECK-NEXT:    vmand.mm v0, v10, v9
 ; CHECK-NEXT:    vmflt.vf v0, v8, fa0, v0.t
 ; CHECK-NEXT:    vmnot.m v0, v0
@@ -8113,9 +8113,9 @@ define <vscale x 2 x i1> @fcmp_one_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -8133,9 +8133,9 @@ define <vscale x 2 x i1> @fcmp_one_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t
@@ -8212,9 +8212,9 @@ define <vscale x 2 x i1> @fcmp_ueq_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v10, v12
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v12, v13
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmflt.vf v11, v8, fa0, v0.t
@@ -8232,9 +8232,9 @@ define <vscale x 2 x i1> @fcmp_ueq_fv_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, mu
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfeq.vf v12, v10, fa0
-; CHECK-NEXT:    vmfeq.vv v10, v8, v8
-; CHECK-NEXT:    vmand.mm v10, v12, v10
+; CHECK-NEXT:    vmfeq.vv v12, v8, v8
+; CHECK-NEXT:    vmfeq.vf v13, v10, fa0
+; CHECK-NEXT:    vmand.mm v10, v13, v12
 ; CHECK-NEXT:    vmv1r.v v11, v10
 ; CHECK-NEXT:    vmv1r.v v0, v10
 ; CHECK-NEXT:    vmfgt.vf v11, v8, fa0, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll
index 56284d90a146b3..2ca9dd24e915ad 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcmps-constrained-sdnode.ll
@@ -509,8 +509,8 @@ define <vscale x 1 x i1> @fcmps_uno_vf_nxv1f16(<vscale x 1 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -1041,8 +1041,8 @@ define <vscale x 2 x i1> @fcmps_uno_vf_nxv2f16(<vscale x 2 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -1573,8 +1573,8 @@ define <vscale x 4 x i1> @fcmps_uno_vf_nxv4f16(<vscale x 4 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -2105,10 +2105,10 @@ define <vscale x 8 x i1> @fcmps_uno_vf_nxv8f16(<vscale x 8 x half> %va, half %b)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfle.vf v12, v10, fa0
-; CHECK-NEXT:    vmfle.vv v10, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v10
-; CHECK-NEXT:    vmorn.mm v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v12, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v10, fa0
+; CHECK-NEXT:    vmnot.m v9, v12
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2637,10 +2637,10 @@ define <vscale x 16 x i1> @fcmps_uno_vf_nxv16f16(<vscale x 16 x half> %va, half
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
-; CHECK-NEXT:    vmfle.vf v16, v12, fa0
-; CHECK-NEXT:    vmfle.vv v12, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v12
-; CHECK-NEXT:    vmorn.mm v0, v8, v16
+; CHECK-NEXT:    vmfle.vv v16, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v12, fa0
+; CHECK-NEXT:    vmnot.m v9, v16
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -3169,10 +3169,10 @@ define <vscale x 32 x i1> @fcmps_uno_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
-; CHECK-NEXT:    vmfle.vf v24, v16, fa0
-; CHECK-NEXT:    vmfle.vv v16, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v16
-; CHECK-NEXT:    vmorn.mm v0, v8, v24
+; CHECK-NEXT:    vmfle.vv v24, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v16, fa0
+; CHECK-NEXT:    vmnot.m v9, v24
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer
@@ -3701,8 +3701,8 @@ define <vscale x 1 x i1> @fcmps_uno_vf_nxv1f32(<vscale x 1 x float> %va, float %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -4233,8 +4233,8 @@ define <vscale x 2 x i1> @fcmps_uno_vf_nxv2f32(<vscale x 2 x float> %va, float %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -4765,10 +4765,10 @@ define <vscale x 4 x i1> @fcmps_uno_vf_nxv4f32(<vscale x 4 x float> %va, float %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfle.vf v12, v10, fa0
-; CHECK-NEXT:    vmfle.vv v10, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v10
-; CHECK-NEXT:    vmorn.mm v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v12, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v10, fa0
+; CHECK-NEXT:    vmnot.m v9, v12
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 4 x float> %head, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
@@ -5297,10 +5297,10 @@ define <vscale x 8 x i1> @fcmps_uno_vf_nxv8f32(<vscale x 8 x float> %va, float %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
-; CHECK-NEXT:    vmfle.vf v16, v12, fa0
-; CHECK-NEXT:    vmfle.vv v12, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v12
-; CHECK-NEXT:    vmorn.mm v0, v8, v16
+; CHECK-NEXT:    vmfle.vv v16, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v12, fa0
+; CHECK-NEXT:    vmnot.m v9, v16
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
@@ -5829,10 +5829,10 @@ define <vscale x 16 x i1> @fcmps_uno_vf_nxv16f32(<vscale x 16 x float> %va, floa
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
-; CHECK-NEXT:    vmfle.vf v24, v16, fa0
-; CHECK-NEXT:    vmfle.vv v16, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v16
-; CHECK-NEXT:    vmorn.mm v0, v8, v24
+; CHECK-NEXT:    vmfle.vv v24, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v16, fa0
+; CHECK-NEXT:    vmnot.m v9, v24
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x float> poison, float %b, i32 0
   %splat = shufflevector <vscale x 16 x float> %head, <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer
@@ -6361,8 +6361,8 @@ define <vscale x 1 x i1> @fcmps_uno_vf_nxv1f64(<vscale x 1 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v9, fa0
-; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmfle.vv v8, v8, v8
+; CHECK-NEXT:    vmfle.vf v9, v9, fa0
 ; CHECK-NEXT:    vmnot.m v8, v8
 ; CHECK-NEXT:    vmorn.mm v0, v8, v9
 ; CHECK-NEXT:    ret
@@ -6893,10 +6893,10 @@ define <vscale x 2 x i1> @fcmps_uno_vf_nxv2f64(<vscale x 2 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v10, fa0
-; CHECK-NEXT:    vmfle.vf v12, v10, fa0
-; CHECK-NEXT:    vmfle.vv v10, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v10
-; CHECK-NEXT:    vmorn.mm v0, v8, v12
+; CHECK-NEXT:    vmfle.vv v12, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v10, fa0
+; CHECK-NEXT:    vmnot.m v9, v12
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 2 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 2 x double> %head, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
@@ -7425,10 +7425,10 @@ define <vscale x 4 x i1> @fcmps_uno_vf_nxv4f64(<vscale x 4 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v12, fa0
-; CHECK-NEXT:    vmfle.vf v16, v12, fa0
-; CHECK-NEXT:    vmfle.vv v12, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v12
-; CHECK-NEXT:    vmorn.mm v0, v8, v16
+; CHECK-NEXT:    vmfle.vv v16, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v12, fa0
+; CHECK-NEXT:    vmnot.m v9, v16
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 4 x double> %head, <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
@@ -7957,10 +7957,10 @@ define <vscale x 8 x i1> @fcmps_uno_vf_nxv8f64(<vscale x 8 x double> %va, double
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmv.v.f v16, fa0
-; CHECK-NEXT:    vmfle.vf v24, v16, fa0
-; CHECK-NEXT:    vmfle.vv v16, v8, v8
-; CHECK-NEXT:    vmnot.m v8, v16
-; CHECK-NEXT:    vmorn.mm v0, v8, v24
+; CHECK-NEXT:    vmfle.vv v24, v8, v8
+; CHECK-NEXT:    vmfle.vf v8, v16, fa0
+; CHECK-NEXT:    vmnot.m v9, v24
+; CHECK-NEXT:    vmorn.mm v0, v9, v8
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x double> poison, double %b, i32 0
   %splat = shufflevector <vscale x 8 x double> %head, <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
index b28981ff196ad7..beb56a2645a1c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfcopysign-sdnode.ll
@@ -128,12 +128,12 @@ define <vscale x 1 x half> @vfcopysign_vf_nxv1f16(<vscale x 1 x half> %vm, half
 ; ZVFHMIN-LABEL: vfcopysign_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 1 x half> poison, half %s, i32 0
@@ -174,13 +174,13 @@ define <vscale x 1 x half> @vfcopynsign_vf_nxv1f16(<vscale x 1 x half> %vm, half
 ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 1 x half> poison, half %s, i32 0
@@ -227,9 +227,9 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vf_nxv1f16_nxv1f32(<vscale x 1 x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfmv.v.f v9, fa0
+; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
-; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v9, v10, a0
@@ -253,11 +253,11 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f32(<vscale x 1
 ; ZVFHMIN-LABEL: vfcopynsign_exttrunc_vv_nxv1f16_nxv1f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
+; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vxor.vx v9, v10, a0
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -284,8 +284,8 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f32(<vscale x 1
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v9
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vxor.vx v9, v10, a0
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
@@ -312,9 +312,9 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vv_nxv1f16_nxv1f64(<vscale x 1 x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.rod.f.f.w v10, v9
+; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v10
-; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
@@ -341,12 +341,12 @@ define <vscale x 1 x half> @vfcopysign_exttrunc_vf_nxv1f16_nxv1f64(<vscale x 1 x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfmv.v.f v9, fa0
+; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.rod.f.f.w v10, v9
+; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v10
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
@@ -371,13 +371,12 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vv_nxv1f16_nxv1f64(<vscale x 1
 ; ZVFHMIN-LABEL: vfcopynsign_exttrunc_vv_nxv1f16_nxv1f64:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.rod.f.f.w v10, v9
+; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v10
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
@@ -406,11 +405,10 @@ define <vscale x 1 x half> @vfcopynsign_exttrunc_vf_nxv1f16_nxv1f64(<vscale x 1
 ; ZVFHMIN-NEXT:    vfmv.v.f v9, fa0
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.rod.f.f.w v10, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v9, v10
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
 ; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
@@ -456,12 +454,12 @@ define <vscale x 2 x half> @vfcopysign_vf_nxv2f16(<vscale x 2 x half> %vm, half
 ; ZVFHMIN-LABEL: vfcopysign_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 2 x half> poison, half %s, i32 0
@@ -502,13 +500,13 @@ define <vscale x 2 x half> @vfcopynsign_vf_nxv2f16(<vscale x 2 x half> %vm, half
 ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 2 x half> poison, half %s, i32 0
@@ -550,12 +548,12 @@ define <vscale x 4 x half> @vfcopysign_vf_nxv4f16(<vscale x 4 x half> %vm, half
 ; ZVFHMIN-LABEL: vfcopysign_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 4 x half> poison, half %s, i32 0
@@ -596,13 +594,13 @@ define <vscale x 4 x half> @vfcopynsign_vf_nxv4f16(<vscale x 4 x half> %vm, half
 ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v9, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v9, v9, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v9, v9, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v9
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 4 x half> poison, half %s, i32 0
@@ -644,12 +642,12 @@ define <vscale x 8 x half> @vfcopysign_vf_nxv8f16(<vscale x 8 x half> %vm, half
 ; ZVFHMIN-LABEL: vfcopysign_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v10, v10, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %s, i32 0
@@ -690,13 +688,13 @@ define <vscale x 8 x half> @vfcopynsign_vf_nxv8f16(<vscale x 8 x half> %vm, half
 ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v10, v10, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %s, i32 0
@@ -743,9 +741,9 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vf_nxv8f16_nxv8f32(<vscale x 8 x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfmv.v.f v12, fa0
+; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v12
-; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v10, v10, a0
@@ -769,11 +767,11 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f32(<vscale x 8
 ; ZVFHMIN-LABEL: vfcopynsign_exttrunc_vv_nxv8f16_nxv8f32:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v12
+; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v10, v10, a0
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v10
 ; ZVFHMIN-NEXT:    ret
@@ -800,8 +798,8 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f32(<vscale x 8
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v12
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
 ; ZVFHMIN-NEXT:    vand.vx v10, v10, a0
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v10
@@ -828,9 +826,9 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vv_nxv8f16_nxv8f64(<vscale x 8 x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.rod.f.f.w v12, v16
+; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v12
-; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v10, v10, a0
@@ -857,12 +855,12 @@ define <vscale x 8 x half> @vfcopysign_exttrunc_vf_nxv8f16_nxv8f64(<vscale x 8 x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmv.v.f v16, fa0
+; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.rod.f.f.w v12, v16
+; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v12
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vand.vx v10, v10, a0
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v10
@@ -887,13 +885,12 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vv_nxv8f16_nxv8f64(<vscale x 8
 ; ZVFHMIN-LABEL: vfcopynsign_exttrunc_vv_nxv8f16_nxv8f64:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.rod.f.f.w v12, v16
+; ZVFHMIN-NEXT:    addi a1, a0, -1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v12
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
 ; ZVFHMIN-NEXT:    vand.vx v10, v10, a0
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v10
@@ -922,11 +919,10 @@ define <vscale x 8 x half> @vfcopynsign_exttrunc_vf_nxv8f16_nxv8f64(<vscale x 8
 ; ZVFHMIN-NEXT:    vfmv.v.f v16, fa0
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.rod.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v10, v12
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
 ; ZVFHMIN-NEXT:    vand.vx v10, v10, a0
@@ -972,12 +968,12 @@ define <vscale x 16 x half> @vfcopysign_vf_nxv16f16(<vscale x 16 x half> %vm, ha
 ; ZVFHMIN-LABEL: vfcopysign_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v12, v12, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v12, v12, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v12
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %s, i32 0
@@ -1018,13 +1014,13 @@ define <vscale x 16 x half> @vfcopynsign_vf_nxv16f16(<vscale x 16 x half> %vm, h
 ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v12, v12, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v12, v12, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v12
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %s, i32 0
@@ -1066,12 +1062,12 @@ define <vscale x 32 x half> @vfcopysign_vf_nxv32f16(<vscale x 32 x half> %vm, ha
 ; ZVFHMIN-LABEL: vfcopysign_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v16, v16, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v16, v16, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v16
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %s, i32 0
@@ -1112,13 +1108,13 @@ define <vscale x 32 x half> @vfcopynsign_vf_nxv32f16(<vscale x 32 x half> %vm, h
 ; ZVFHMIN-LABEL: vfcopynsign_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
-; ZVFHMIN-NEXT:    addi a1, a0, -1
-; ZVFHMIN-NEXT:    vand.vx v8, v8, a1
-; ZVFHMIN-NEXT:    vand.vx v16, v16, a0
+; ZVFHMIN-NEXT:    addi a0, a1, -1
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
+; ZVFHMIN-NEXT:    vand.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vand.vx v16, v16, a1
 ; ZVFHMIN-NEXT:    vor.vv v8, v8, v16
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %s, i32 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
index ab517de846b0f4..07750623dd44b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll
@@ -209,19 +209,34 @@ define <vscale x 16 x bfloat> @vfdiv_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) strictfp {
 ; CHECK-LABEL: vfdiv_vv_nxv32bf16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfdiv.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfdiv.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 entry:
   %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fdiv.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
@@ -231,22 +246,39 @@ entry:
 define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) strictfp {
 ; CHECK-LABEL: vfdiv_vf_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v24, v24, v0
+; CHECK-NEXT:    vfdiv.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v16, v24
+; CHECK-NEXT:    vfdiv.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -529,19 +561,34 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0: # %entry
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfdiv.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
 entry:
   %vc = call <vscale x 32 x half> @llvm.experimental.constrained.fdiv.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
@@ -557,22 +604,39 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfdiv.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
index 8d0c3bcf167561..b4a9b1fe3fcf8c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-sdnode.ll
@@ -200,19 +200,34 @@ define <vscale x 16 x bfloat> @vfdiv_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
 ; CHECK-LABEL: vfdiv_vv_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfdiv.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfdiv.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %vc = fdiv <vscale x 32 x bfloat> %va, %vb
   ret <vscale x 32 x bfloat> %vc
@@ -221,22 +236,39 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
 ; CHECK-LABEL: vfdiv_vf_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v24, v24, v0
+; CHECK-NEXT:    vfdiv.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v24, v0
+; CHECK-NEXT:    vfdiv.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -508,19 +540,34 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfdiv_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfdiv.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %vc = fdiv <vscale x 32 x half> %va, %vb
   ret <vscale x 32 x half> %vc
@@ -535,22 +582,39 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfdiv_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfdiv.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index 28e71e845a409e..ccd286b7ee5fd3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -375,20 +375,21 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -426,22 +427,23 @@ define <vscale x 32 x bfloat> @vfdiv_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -478,23 +480,24 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; CHECK-NEXT:    vmv8r.v v24, v8
+; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 3
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    add a1, a3, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a4, a4, 3
 ; CHECK-NEXT:    add a4, sp, a4
@@ -502,18 +505,18 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a4, a2, 3
-; CHECK-NEXT:    add a2, a4, a2
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a4, a3, 3
+; CHECK-NEXT:    add a3, a4, a3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -565,31 +568,37 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.x v16, a1
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    vmv4r.v v16, v8
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB23_2
@@ -1110,20 +1119,21 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1167,22 +1177,23 @@ define <vscale x 32 x half> @vfdiv_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1225,23 +1236,24 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v8
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    slli a3, a1, 3
+; ZVFHMIN-NEXT:    add a1, a3, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
@@ -1249,18 +1261,18 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 3
-; ZVFHMIN-NEXT:    add a2, a4, a2
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 3
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1318,31 +1330,37 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    vmv4r.v v16, v8
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB47_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll b/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
index 3107d4e044cae1..abfb652f2206e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfirst-byte-compare-index.ll
@@ -6,35 +6,35 @@
 define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 signext %len, i32 signext %n) {
 ; CHECK-LABEL: compare_bytes_simple:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addiw a4, a2, 1
-; CHECK-NEXT:    bltu a3, a4, .LBB0_7
+; CHECK-NEXT:    addiw a5, a2, 1
+; CHECK-NEXT:    bltu a3, a5, .LBB0_7
 ; CHECK-NEXT:  # %bb.1: # %mismatch_mem_check
-; CHECK-NEXT:    slli a2, a4, 32
+; CHECK-NEXT:    slli a2, a5, 32
+; CHECK-NEXT:    slli a4, a3, 32
 ; CHECK-NEXT:    srli a2, a2, 32
-; CHECK-NEXT:    slli a5, a3, 32
-; CHECK-NEXT:    srli a5, a5, 32
+; CHECK-NEXT:    srli a4, a4, 32
 ; CHECK-NEXT:    add a6, a0, a2
-; CHECK-NEXT:    add a7, a0, a5
+; CHECK-NEXT:    add a7, a0, a4
 ; CHECK-NEXT:    srli a6, a6, 12
 ; CHECK-NEXT:    srli a7, a7, 12
 ; CHECK-NEXT:    bne a6, a7, .LBB0_7
 ; CHECK-NEXT:  # %bb.2: # %mismatch_mem_check
 ; CHECK-NEXT:    add a6, a1, a2
-; CHECK-NEXT:    add a7, a1, a5
+; CHECK-NEXT:    add a7, a1, a4
 ; CHECK-NEXT:    srli a6, a6, 12
 ; CHECK-NEXT:    srli a7, a7, 12
 ; CHECK-NEXT:    bne a6, a7, .LBB0_7
 ; CHECK-NEXT:  .LBB0_3: # %mismatch_vec_loop
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    sub a4, a5, a2
-; CHECK-NEXT:    vsetvli a4, a4, e8, m2, ta, ma
+; CHECK-NEXT:    sub a5, a4, a2
 ; CHECK-NEXT:    add a6, a0, a2
+; CHECK-NEXT:    add a7, a1, a2
+; CHECK-NEXT:    vsetvli a5, a5, e8, m2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a6)
-; CHECK-NEXT:    add a6, a1, a2
-; CHECK-NEXT:    vle8.v v10, (a6)
+; CHECK-NEXT:    vle8.v v10, (a7)
 ; CHECK-NEXT:    vmsne.vv v12, v8, v10
 ; CHECK-NEXT:    vfirst.m a7, v12
-; CHECK-NEXT:    mv a6, a4
+; CHECK-NEXT:    mv a6, a5
 ; CHECK-NEXT:    bltz a7, .LBB0_5
 ; CHECK-NEXT:  # %bb.4: # %mismatch_vec_loop
 ; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
@@ -42,30 +42,30 @@ define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 signext %len, i32 signext %
 ; CHECK-NEXT:  .LBB0_5: # %mismatch_vec_loop
 ; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    sext.w a7, a6
-; CHECK-NEXT:    bne a7, a4, .LBB0_11
+; CHECK-NEXT:    bne a7, a5, .LBB0_11
 ; CHECK-NEXT:  # %bb.6: # %mismatch_vec_loop_inc
 ; CHECK-NEXT:    # in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    bne a2, a5, .LBB0_3
+; CHECK-NEXT:    add a2, a2, a5
+; CHECK-NEXT:    bne a2, a4, .LBB0_3
 ; CHECK-NEXT:    j .LBB0_9
 ; CHECK-NEXT:  .LBB0_7: # %mismatch_loop
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    slli a2, a4, 32
+; CHECK-NEXT:    slli a2, a5, 32
 ; CHECK-NEXT:    srli a2, a2, 32
-; CHECK-NEXT:    add a5, a0, a2
-; CHECK-NEXT:    lbu a5, 0(a5)
+; CHECK-NEXT:    add a4, a0, a2
 ; CHECK-NEXT:    add a2, a1, a2
+; CHECK-NEXT:    lbu a4, 0(a4)
 ; CHECK-NEXT:    lbu a2, 0(a2)
-; CHECK-NEXT:    bne a5, a2, .LBB0_10
+; CHECK-NEXT:    bne a4, a2, .LBB0_10
 ; CHECK-NEXT:  # %bb.8: # %mismatch_loop_inc
 ; CHECK-NEXT:    # in Loop: Header=BB0_7 Depth=1
-; CHECK-NEXT:    addiw a4, a4, 1
-; CHECK-NEXT:    bne a3, a4, .LBB0_7
+; CHECK-NEXT:    addiw a5, a5, 1
+; CHECK-NEXT:    bne a3, a5, .LBB0_7
 ; CHECK-NEXT:  .LBB0_9: # %while.end
 ; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_10:
-; CHECK-NEXT:    mv a0, a4
+; CHECK-NEXT:    mv a0, a5
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_11: # %mismatch_vec_loop_found
 ; CHECK-NEXT:    slli a6, a6, 32

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index 83f59f973d4659..fd518d9be786de 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -51,12 +51,12 @@ define <vscale x 1 x bfloat> @vfma_vf_nxv1bf16(<vscale x 1 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; CHECK-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -71,14 +71,14 @@ define <vscale x 1 x bfloat> @vfma_vf_nxv1bf16_commute(<vscale x 1 x bfloat> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v9, v8, v11, v0.t
+; CHECK-NEXT:    vfmadd.vv v11, v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v11
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 1 x bfloat> %elt.head, <vscale x 1 x bfloat> poison, <vscale x 1 x i32> zeroinitializer
@@ -91,12 +91,12 @@ define <vscale x 1 x bfloat> @vfma_vf_nxv1bf16_unmasked(<vscale x 1 x bfloat> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v12, v9, v11
+; CHECK-NEXT:    vfmadd.vv v12, v11, v10
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -111,12 +111,12 @@ define <vscale x 1 x bfloat> @vfma_vf_nxv1bf16_unmasked_commute(<vscale x 1 x bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v12, v9, v11
+; CHECK-NEXT:    vfmadd.vv v12, v11, v10
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -165,12 +165,12 @@ define <vscale x 2 x bfloat> @vfma_vf_nxv2bf16(<vscale x 2 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; CHECK-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -185,14 +185,14 @@ define <vscale x 2 x bfloat> @vfma_vf_nxv2bf16_commute(<vscale x 2 x bfloat> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfmadd.vv v9, v8, v11, v0.t
+; CHECK-NEXT:    vfmadd.vv v11, v8, v10, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v11
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 2 x bfloat> %elt.head, <vscale x 2 x bfloat> poison, <vscale x 2 x i32> zeroinitializer
@@ -205,12 +205,12 @@ define <vscale x 2 x bfloat> @vfma_vf_nxv2bf16_unmasked(<vscale x 2 x bfloat> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfmadd.vv v12, v9, v11
+; CHECK-NEXT:    vfmadd.vv v12, v11, v10
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -225,12 +225,12 @@ define <vscale x 2 x bfloat> @vfma_vf_nxv2bf16_unmasked_commute(<vscale x 2 x bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vfmadd.vv v12, v9, v11
+; CHECK-NEXT:    vfmadd.vv v12, v11, v10
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -279,14 +279,14 @@ define <vscale x 4 x bfloat> @vfma_vf_nxv4bf16(<vscale x 4 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v14, v12, v0.t
+; CHECK-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v14
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
@@ -299,14 +299,14 @@ define <vscale x 4 x bfloat> @vfma_vf_nxv4bf16_commute(<vscale x 4 x bfloat> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v14, v8, v12, v0.t
+; CHECK-NEXT:    vfmadd.vv v12, v14, v10, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v14
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
@@ -319,14 +319,14 @@ define <vscale x 4 x bfloat> @vfma_vf_nxv4bf16_unmasked(<vscale x 4 x bfloat> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v14, v12
+; CHECK-NEXT:    vfmadd.vv v14, v12, v10
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v14
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
@@ -339,14 +339,14 @@ define <vscale x 4 x bfloat> @vfma_vf_nxv4bf16_unmasked_commute(<vscale x 4 x bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v14, v12
+; CHECK-NEXT:    vfmadd.vv v14, v12, v10
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v14
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 4 x bfloat> %elt.head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
@@ -393,14 +393,14 @@ define <vscale x 8 x bfloat> @vfma_vf_nxv8bf16(<vscale x 8 x bfloat> %va, bfloat
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v20, v16, v0.t
+; CHECK-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v20
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -413,14 +413,14 @@ define <vscale x 8 x bfloat> @vfma_vf_nxv8bf16_commute(<vscale x 8 x bfloat> %va
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfmadd.vv v20, v8, v16, v0.t
+; CHECK-NEXT:    vfmadd.vv v16, v20, v12, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v20
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -433,14 +433,14 @@ define <vscale x 8 x bfloat> @vfma_vf_nxv8bf16_unmasked(<vscale x 8 x bfloat> %v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v20, v16
+; CHECK-NEXT:    vfmadd.vv v20, v16, v12
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v20
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -453,14 +453,14 @@ define <vscale x 8 x bfloat> @vfma_vf_nxv8bf16_unmasked_commute(<vscale x 8 x bf
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v10
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v20, v16
+; CHECK-NEXT:    vfmadd.vv v20, v16, v12
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v20
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 8 x bfloat> %elt.head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -528,12 +528,13 @@ define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfl
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v4, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv.v.x v12, a1
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
@@ -557,14 +558,14 @@ define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16_commute(<vscale x 16 x bfloat>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    vmv.v.x v4, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v8, v24, v0.t
+; CHECK-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
@@ -575,31 +576,16 @@ define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16_commute(<vscale x 16 x bfloat>
 define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x bfloat> %vc, i32 zeroext %evl) {
 ; CHECK-LABEL: vfma_vf_nxv16bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v0, v24
+; CHECK-NEXT:    vfmadd.vv v0, v24, v16
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
@@ -610,31 +596,16 @@ define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16_unmasked(<vscale x 16 x bfloat>
 define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16_unmasked_commute(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x bfloat> %vc, i32 zeroext %evl) {
 ; CHECK-LABEL: vfma_vf_nxv16bf16_unmasked_commute:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vmv.v.x v12, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v0, v24
+; CHECK-NEXT:    vfmadd.vv v0, v24, v16
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
@@ -661,11 +632,8 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    vl8re16.v v0, (a0)
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a0, a2, 1
-; CHECK-NEXT:    sub a3, a1, a0
-; CHECK-NEXT:    sltu a4, a1, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a1, a0
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a4, a4, 4
 ; CHECK-NEXT:    add a4, sp, a4
@@ -673,48 +641,51 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs1r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    mv a4, a2
 ; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a4, a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vs1r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    sltu a2, a1, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    mv a4, a3
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a4, a4, a3
+; CHECK-NEXT:    slli a3, a3, 1
+; CHECK-NEXT:    add a3, a3, a4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    vmv4r.v v24, v8
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a4, a2, 5
-; CHECK-NEXT:    add a2, a4, a2
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a4, a2, 4
-; CHECK-NEXT:    add a2, a4, a2
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv8r.v v24, v16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a4, a3, 5
+; CHECK-NEXT:    add a3, a4, a3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a4, a3, 4
+; CHECK-NEXT:    add a3, a4, a3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -806,8 +777,7 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    slli a2, a2, 5
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vmv8r.v v24, v16
-; CHECK-NEXT:    vl8re16.v v16, (a0)
+; CHECK-NEXT:    vl8re16.v v24, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a2, a0
@@ -815,45 +785,47 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
 ; CHECK-NEXT:    slli a0, a2, 1
-; CHECK-NEXT:    sub a3, a1, a0
-; CHECK-NEXT:    sltu a4, a1, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    sub a3, a1, a0
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v7, a2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    sltu a2, a1, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv4r.v v8, v16
+; CHECK-NEXT:    vmv8r.v v24, v16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    mv a4, a3
+; CHECK-NEXT:    slli a3, a3, 1
+; CHECK-NEXT:    add a3, a3, a4
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -914,58 +886,55 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
 ; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v24, a1
+; CHECK-NEXT:    fmv.x.h a2, fa0
+; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 5
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    slli a4, a1, 5
+; CHECK-NEXT:    add a1, a4, a1
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
-; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    slli a1, a3, 1
+; CHECK-NEXT:    srli a3, a3, 2
+; CHECK-NEXT:    sub a4, a0, a1
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 4
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vs1r.v v0, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    sltu a3, a0, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 4
+; CHECK-NEXT:    mv a5, a4
+; CHECK-NEXT:    slli a4, a4, 3
+; CHECK-NEXT:    add a5, a5, a4
+; CHECK-NEXT:    slli a4, a4, 1
+; CHECK-NEXT:    add a4, a4, a5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a4, a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.x v24, a2
+; CHECK-NEXT:    vmv4r.v v8, v24
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a4, a2, 4
 ; CHECK-NEXT:    add a2, a4, a2
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a4, a2, 5
-; CHECK-NEXT:    add a2, a4, a2
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
@@ -983,11 +952,8 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a2, a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a2, a1, 5
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -998,13 +964,6 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 4
-; CHECK-NEXT:    add a1, a2, a1
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a2, a2, a1
@@ -1012,12 +971,19 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
 ; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a2, a1, 5
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a2, a1, 4
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v0
 ; CHECK-NEXT:    csrr a1, vlenb
@@ -1031,11 +997,8 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfl
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a2, a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a2, a1, 5
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -1076,58 +1039,55 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_commute(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v24, a1
+; CHECK-NEXT:    fmv.x.h a2, fa0
+; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 5
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    slli a4, a1, 5
+; CHECK-NEXT:    add a1, a4, a1
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
-; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    slli a1, a3, 1
+; CHECK-NEXT:    srli a3, a3, 2
+; CHECK-NEXT:    sub a4, a0, a1
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 4
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vs1r.v v0, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    sltu a3, a0, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 4
+; CHECK-NEXT:    mv a5, a4
+; CHECK-NEXT:    slli a4, a4, 3
+; CHECK-NEXT:    add a5, a5, a4
+; CHECK-NEXT:    slli a4, a4, 1
+; CHECK-NEXT:    add a4, a4, a5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a4, a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.x v24, a2
+; CHECK-NEXT:    vmv4r.v v8, v24
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a4, a2, 4
 ; CHECK-NEXT:    add a2, a4, a2
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a4, a2, 5
-; CHECK-NEXT:    add a2, a4, a2
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
@@ -1145,11 +1105,8 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_commute(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB33_2:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a2, a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a2, a1, 5
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -1160,25 +1117,25 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_commute(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 4
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a2, a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 5
+; CHECK-NEXT:    slli a2, a1, 4
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a2, a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a2, a1, 5
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -1193,11 +1150,8 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_commute(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a2, a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    slli a2, a1, 5
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
@@ -1234,50 +1188,46 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v24, a1
+; CHECK-NEXT:    fmv.x.h a2, fa0
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    add a1, a1, a4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmset.m v7
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    slli a1, a3, 1
+; CHECK-NEXT:    srli a3, a3, 2
+; CHECK-NEXT:    sub a4, a0, a1
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v7, a3
+; CHECK-NEXT:    sltu a3, a0, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    slli a4, a4, 4
+; CHECK-NEXT:    add a4, sp, a4
+; CHECK-NEXT:    addi a4, a4, 16
+; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.x v24, a2
+; CHECK-NEXT:    vmv4r.v v8, v24
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1290,7 +1240,10 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
@@ -1298,16 +1251,13 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -1341,50 +1291,46 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v24, a1
+; CHECK-NEXT:    fmv.x.h a2, fa0
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    add a1, a1, a4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
-; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmset.m v7
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a2
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    slli a1, a3, 1
+; CHECK-NEXT:    srli a3, a3, 2
+; CHECK-NEXT:    sub a4, a0, a1
+; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v7, a3
+; CHECK-NEXT:    sltu a3, a0, a4
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a4
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    slli a4, a4, 4
+; CHECK-NEXT:    add a4, sp, a4
+; CHECK-NEXT:    addi a4, a4, 16
+; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.x v24, a2
+; CHECK-NEXT:    vmv4r.v v8, v24
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    mv a4, a2
-; CHECK-NEXT:    slli a2, a2, 1
-; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -1397,7 +1343,10 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB35_2:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
@@ -1405,16 +1354,13 @@ define <vscale x 32 x bfloat> @vfma_vf_nxv32bf16_unmasked_commute(<vscale x 32 x
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -1496,12 +1442,12 @@ define <vscale x 1 x half> @vfma_vf_nxv1f16(<vscale x 1 x half> %va, half %b, <v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -1522,14 +1468,14 @@ define <vscale x 1 x half> @vfma_vf_nxv1f16_commute(<vscale x 1 x half> %va, hal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v8, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v11, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v11
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -1548,12 +1494,12 @@ define <vscale x 1 x half> @vfma_vf_nxv1f16_unmasked(<vscale x 1 x half> %va, ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -1574,12 +1520,12 @@ define <vscale x 1 x half> @vfma_vf_nxv1f16_unmasked_commute(<vscale x 1 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -1647,12 +1593,12 @@ define <vscale x 2 x half> @vfma_vf_nxv2f16(<vscale x 2 x half> %va, half %b, <v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -1673,14 +1619,14 @@ define <vscale x 2 x half> @vfma_vf_nxv2f16_commute(<vscale x 2 x half> %va, hal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v8, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v11, v8, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v11
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
@@ -1699,12 +1645,12 @@ define <vscale x 2 x half> @vfma_vf_nxv2f16_unmasked(<vscale x 2 x half> %va, ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -1725,12 +1671,12 @@ define <vscale x 2 x half> @vfma_vf_nxv2f16_unmasked_commute(<vscale x 2 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -1798,14 +1744,14 @@ define <vscale x 4 x half> @vfma_vf_nxv4f16(<vscale x 4 x half> %va, half %b, <v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -1824,14 +1770,14 @@ define <vscale x 4 x half> @vfma_vf_nxv4f16_commute(<vscale x 4 x half> %va, hal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v14, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -1850,14 +1796,14 @@ define <vscale x 4 x half> @vfma_vf_nxv4f16_unmasked(<vscale x 4 x half> %va, ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -1876,14 +1822,14 @@ define <vscale x 4 x half> @vfma_vf_nxv4f16_unmasked_commute(<vscale x 4 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -1949,14 +1895,14 @@ define <vscale x 8 x half> @vfma_vf_nxv8f16(<vscale x 8 x half> %va, half %b, <v
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -1975,14 +1921,14 @@ define <vscale x 8 x half> @vfma_vf_nxv8f16_commute(<vscale x 8 x half> %va, hal
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v20, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2001,14 +1947,14 @@ define <vscale x 8 x half> @vfma_vf_nxv8f16_unmasked(<vscale x 8 x half> %va, ha
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2027,14 +1973,14 @@ define <vscale x 8 x half> @vfma_vf_nxv8f16_unmasked_commute(<vscale x 8 x half>
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vmv.v.x v10, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -2121,12 +2067,13 @@ define <vscale x 16 x half> @vfma_vf_nxv16f16(<vscale x 16 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
@@ -2156,14 +2103,14 @@ define <vscale x 16 x half> @vfma_vf_nxv16f16_commute(<vscale x 16 x half> %va,
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vmv.v.x v4, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2180,31 +2127,16 @@ define <vscale x 16 x half> @vfma_vf_nxv16f16_unmasked(<vscale x 16 x half> %va,
 ;
 ; ZVFHMIN-LABEL: vfma_vf_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2221,31 +2153,16 @@ define <vscale x 16 x half> @vfma_vf_nxv16f16_unmasked_commute(<vscale x 16 x ha
 ;
 ; ZVFHMIN-LABEL: vfma_vf_nxv16f16_unmasked_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vmv.v.x v12, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2280,11 +2197,8 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vl8re16.v v0, (a0)
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a1, a0
-; ZVFHMIN-NEXT:    sltu a4, a1, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a1, a0
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
@@ -2292,48 +2206,51 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v24, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs1r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    mv a4, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a4, a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vs1r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a2, a1, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a4, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    vmv4r.v v24, v8
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 5
-; ZVFHMIN-NEXT:    add a2, a4, a2
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 4
-; ZVFHMIN-NEXT:    add a2, a4, a2
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v8, v16
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 5
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -2432,8 +2349,7 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
-; ZVFHMIN-NEXT:    vl8re16.v v16, (a0)
+; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a2, a0
@@ -2441,45 +2357,47 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    slli a0, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a1, a0
-; ZVFHMIN-NEXT:    sltu a4, a1, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    sub a3, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a2, a1, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v8, v16
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -2546,58 +2464,55 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 5
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    slli a4, a1, 5
+; ZVFHMIN-NEXT:    add a1, a4, a1
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a5, a5, 4
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vs1r.v v0, (a5) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a5, a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a4, a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    vmv4r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a2, 4
 ; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 5
-; ZVFHMIN-NEXT:    add a2, a4, a2
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
@@ -2615,11 +2530,8 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB68_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a2, a1, 5
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -2630,13 +2542,6 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 4
-; ZVFHMIN-NEXT:    add a1, a2, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    mv a2, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a2, a2, a1
@@ -2644,12 +2549,19 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a1, 5
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
@@ -2663,11 +2575,8 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16(<vscale x 32 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a2, a1, 5
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -2714,58 +2623,55 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 5
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    slli a4, a1, 5
+; ZVFHMIN-NEXT:    add a1, a4, a1
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a5, a5, 4
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vs1r.v v0, (a5) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a5, a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a4, a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    vmv4r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a2, 4
 ; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 5
-; ZVFHMIN-NEXT:    add a2, a4, a2
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
@@ -2783,11 +2689,8 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB69_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a2, a1, 5
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -2798,25 +2701,25 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 4
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a2, a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 5
+; ZVFHMIN-NEXT:    slli a2, a1, 4
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a2, a1, 5
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -2831,11 +2734,8 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_commute(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a2, a1, 5
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
@@ -2878,50 +2778,46 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    mv a4, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, a1, a4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    vmv4r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -2934,7 +2830,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB70_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
@@ -2942,16 +2841,13 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -2991,50 +2887,46 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    mv a4, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, a1, a4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    vmv4r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
@@ -3047,7 +2939,10 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB71_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
@@ -3055,16 +2950,13 @@ define <vscale x 32 x half> @vfma_vf_nxv32f16_unmasked_commute(<vscale x 32 x ha
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -3764,12 +3656,12 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
@@ -3787,27 +3679,35 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    slli a5, a1, 3
-; CHECK-NEXT:    add a6, a2, a5
-; CHECK-NEXT:    vl8re64.v v8, (a6)
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 4
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    sub a6, a4, a1
+; CHECK-NEXT:    add a7, a2, a5
+; CHECK-NEXT:    add a5, a0, a5
+; CHECK-NEXT:    vl8re64.v v8, (a7)
+; CHECK-NEXT:    csrr a7, vlenb
+; CHECK-NEXT:    slli a7, a7, 3
+; CHECK-NEXT:    add a7, sp, a7
+; CHECK-NEXT:    addi a7, a7, 16
+; CHECK-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
 ; CHECK-NEXT:    sltu a7, a4, a6
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    vl8re64.v v16, (a2)
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 3
+; CHECK-NEXT:    mv t0, a5
+; CHECK-NEXT:    slli a5, a5, 2
+; CHECK-NEXT:    add a5, a5, t0
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    and a0, a7, a6
@@ -3820,16 +3720,24 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    slli a2, a2, 2
+; CHECK-NEXT:    add a2, a2, a3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a2, a0
-; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
@@ -3845,27 +3753,26 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -3882,62 +3789,73 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    add a1, a1, a3
+; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    mv a3, a1
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    add a5, a2, a3
-; CHECK-NEXT:    vl8re64.v v24, (a5)
+; CHECK-NEXT:    vl8re64.v v8, (a5)
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 3
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    vl8re64.v v16, (a3)
-; CHECK-NEXT:    sub a3, a4, a1
-; CHECK-NEXT:    sltu a5, a4, a3
+; CHECK-NEXT:    vl8re64.v v24, (a3)
+; CHECK-NEXT:    sltu a3, a4, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a3, a5, a3
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a5
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v8, v24
+; CHECK-NEXT:    vfmadd.vv v24, v16, v8
 ; CHECK-NEXT:    bltu a4, a1, .LBB129_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB129_2:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v24, v8
+; CHECK-NEXT:    vfmadd.vv v0, v16, v8
 ; CHECK-NEXT:    vmv.v.v v8, v0
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -3960,14 +3878,15 @@ define <vscale x 1 x half> @vfmsub_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv1f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -3986,14 +3905,15 @@ define <vscale x 1 x half> @vfmsub_vv_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv1f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -4015,14 +3935,14 @@ define <vscale x 1 x half> @vfmsub_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -4046,16 +3966,16 @@ define <vscale x 1 x half> @vfmsub_vf_nxv1f16_commute(<vscale x 1 x half> %va, h
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v8, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v11, v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v11
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -4077,14 +3997,14 @@ define <vscale x 1 x half> @vfmsub_vf_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -4108,14 +4028,14 @@ define <vscale x 1 x half> @vfmsub_vf_nxv1f16_unmasked_commute(<vscale x 1 x hal
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -4451,8 +4371,8 @@ define <vscale x 1 x half> @vfnmadd_vf_nxv1f16_neg_splat_unmasked(<vscale x 1 x
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
@@ -4484,8 +4404,8 @@ define <vscale x 1 x half> @vfnmadd_vf_nxv1f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
@@ -4629,16 +4549,16 @@ define <vscale x 1 x half> @vfnmsub_vf_nxv1f16(<vscale x 1 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -4660,16 +4580,16 @@ define <vscale x 1 x half> @vfnmsub_vf_nxv1f16_commute(<vscale x 1 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v11, v9, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v11
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -4691,16 +4611,16 @@ define <vscale x 1 x half> @vfnmsub_vf_nxv1f16_unmasked(<vscale x 1 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -4722,16 +4642,16 @@ define <vscale x 1 x half> @vfnmsub_vf_nxv1f16_unmasked_commute(<vscale x 1 x ha
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -4753,16 +4673,16 @@ define <vscale x 1 x half> @vfnmsub_vf_nxv1f16_neg_splat(<vscale x 1 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v11, v9, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v10, v9, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v11
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 1 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 1 x half> %elt.head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -4784,14 +4704,14 @@ define <vscale x 1 x half> @vfnmsub_vf_nxv1f16_neg_splat_commute(<vscale x 1 x h
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v10, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -4815,14 +4735,14 @@ define <vscale x 1 x half> @vfnmsub_vf_nxv1f16_neg_splat_unmasked(<vscale x 1 x
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v10, v11
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -4846,14 +4766,14 @@ define <vscale x 1 x half> @vfnmsub_vf_nxv1f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v10, v11
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -4877,14 +4797,15 @@ define <vscale x 2 x half> @vfmsub_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv2f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -4903,14 +4824,15 @@ define <vscale x 2 x half> @vfmsub_vv_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv2f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -4932,14 +4854,14 @@ define <vscale x 2 x half> @vfmsub_vf_nxv2f16(<vscale x 2 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -4963,16 +4885,16 @@ define <vscale x 2 x half> @vfmsub_vf_nxv2f16_commute(<vscale x 2 x half> %va, h
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v8, v11, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v11, v8, v9, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v11
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
@@ -4994,14 +4916,14 @@ define <vscale x 2 x half> @vfmsub_vf_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -5025,14 +4947,14 @@ define <vscale x 2 x half> @vfmsub_vf_nxv2f16_unmasked_commute(<vscale x 2 x hal
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v9
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -5368,8 +5290,8 @@ define <vscale x 2 x half> @vfnmadd_vf_nxv2f16_neg_splat_unmasked(<vscale x 2 x
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
@@ -5401,8 +5323,8 @@ define <vscale x 2 x half> @vfnmadd_vf_nxv2f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
@@ -5546,16 +5468,16 @@ define <vscale x 2 x half> @vfnmsub_vf_nxv2f16(<vscale x 2 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
@@ -5577,16 +5499,16 @@ define <vscale x 2 x half> @vfnmsub_vf_nxv2f16_commute(<vscale x 2 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v11, v9, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v8, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v11
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
@@ -5608,16 +5530,16 @@ define <vscale x 2 x half> @vfnmsub_vf_nxv2f16_unmasked(<vscale x 2 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
@@ -5639,16 +5561,16 @@ define <vscale x 2 x half> @vfnmsub_vf_nxv2f16_unmasked_commute(<vscale x 2 x ha
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
@@ -5670,16 +5592,16 @@ define <vscale x 2 x half> @vfnmsub_vf_nxv2f16_neg_splat(<vscale x 2 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v11, v9, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v10, v9, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v11
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 2 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 2 x half> %elt.head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
@@ -5701,14 +5623,14 @@ define <vscale x 2 x half> @vfnmsub_vf_nxv2f16_neg_splat_commute(<vscale x 2 x h
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v10, v11, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -5732,14 +5654,14 @@ define <vscale x 2 x half> @vfnmsub_vf_nxv2f16_neg_splat_unmasked(<vscale x 2 x
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v10, v11
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -5763,14 +5685,14 @@ define <vscale x 2 x half> @vfnmsub_vf_nxv2f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v10, v11
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -5794,14 +5716,15 @@ define <vscale x 4 x half> @vfmsub_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv4f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -5820,14 +5743,15 @@ define <vscale x 4 x half> @vfmsub_vv_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv4f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -5849,14 +5773,14 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16(<vscale x 4 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -5880,16 +5804,16 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16_commute(<vscale x 4 x half> %va, h
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v8, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v14, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -5911,14 +5835,14 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -5942,14 +5866,14 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16_unmasked_commute(<vscale x 4 x hal
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -6285,8 +6209,8 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16_neg_splat_unmasked(<vscale x 4 x
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10
@@ -6318,8 +6242,8 @@ define <vscale x 4 x half> @vfnmadd_vf_nxv4f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a1
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10
@@ -6463,14 +6387,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16(<vscale x 4 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -6494,16 +6418,16 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_commute(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v8, v14, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v8, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -6525,14 +6449,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_unmasked(<vscale x 4 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -6556,14 +6480,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_unmasked_commute(<vscale x 4 x ha
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -6587,16 +6511,16 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_neg_splat(<vscale x 4 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v14, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v10, v14, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 4 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 4 x half> %elt.head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -6618,14 +6542,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_neg_splat_commute(<vscale x 4 x h
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -6649,14 +6573,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_neg_splat_unmasked(<vscale x 4 x
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -6680,14 +6604,14 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -6711,14 +6635,15 @@ define <vscale x 8 x half> @vfmsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv8f16:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -6737,14 +6662,15 @@ define <vscale x 8 x half> @vfmsub_vv_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv8f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -6766,14 +6692,14 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
@@ -6797,16 +6723,16 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16_commute(<vscale x 8 x half> %va, h
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v20, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -6828,14 +6754,14 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
@@ -6859,14 +6785,14 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16_unmasked_commute(<vscale x 8 x hal
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
@@ -7202,8 +7128,8 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16_neg_splat_unmasked(<vscale x 8 x
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12
@@ -7235,8 +7161,8 @@ define <vscale x 8 x half> @vfnmadd_vf_nxv8f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
 ; ZVFHMIN-NEXT:    vxor.vx v10, v10, a1
+; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12
@@ -7380,14 +7306,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16(<vscale x 8 x half> %va, half %b,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
@@ -7411,16 +7337,16 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_commute(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v20, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -7442,14 +7368,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_unmasked(<vscale x 8 x half> %va,
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
@@ -7473,14 +7399,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_unmasked_commute(<vscale x 8 x ha
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
@@ -7504,16 +7430,16 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_neg_splat(<vscale x 8 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v10, v12, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v20, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v20, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 8 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 8 x half> %elt.head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -7535,14 +7461,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_neg_splat_commute(<vscale x 8 x h
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v10, v12, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -7566,14 +7492,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_neg_splat_unmasked(<vscale x 8 x
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
+; ZVFHMIN-NEXT:    vxor.vx v10, v12, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -7597,14 +7523,14 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16_neg_splat_unmasked_commute(<vscal
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
+; ZVFHMIN-NEXT:    vxor.vx v10, v12, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -7627,19 +7553,34 @@ define <vscale x 16 x half> @vfmsub_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv4r.v v4, v12
-; ZVFHMIN-NEXT:    vmv4r.v v20, v8
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v16, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %negc = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %c, <vscale x 16 x i1> %m, i32 %evl)
   %v = call <vscale x 16 x half> @llvm.vp.fma.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %b, <vscale x 16 x half> %negc, <vscale x 16 x i1> %m, i32 %evl)
@@ -7656,14 +7597,15 @@ define <vscale x 16 x half> @vfmsub_vv_nxv16f16_unmasked(<vscale x 16 x half> %v
 ; ZVFHMIN-LABEL: vfmsub_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v16, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -7681,21 +7623,36 @@ define <vscale x 16 x half> @vfmsub_vf_nxv16f16(<vscale x 16 x half> %va, half %
 ;
 ; ZVFHMIN-LABEL: vfmsub_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv4r.v v16, v8
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -7717,11 +7674,11 @@ define <vscale x 16 x half> @vfmsub_vf_nxv16f16_commute(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
@@ -7756,16 +7713,16 @@ define <vscale x 16 x half> @vfmsub_vf_nxv16f16_unmasked(<vscale x 16 x half> %v
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
@@ -7803,16 +7760,16 @@ define <vscale x 16 x half> @vfmsub_vf_nxv16f16_unmasked_commute(<vscale x 16 x
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
@@ -8234,8 +8191,8 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_neg_splat_unmasked(<vscale x 16
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
@@ -8267,8 +8224,8 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
@@ -8424,21 +8381,36 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16(<vscale x 16 x half> %va, half
 ;
 ; ZVFHMIN-LABEL: vfnmsub_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv4r.v v16, v12
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -8460,16 +8432,16 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_commute(<vscale x 16 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -8499,16 +8471,16 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_unmasked(<vscale x 16 x half> %
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
@@ -8546,16 +8518,16 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_unmasked_commute(<vscale x 16 x
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs4r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
@@ -8586,16 +8558,16 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_neg_splat(<vscale x 16 x half>
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v16, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -8613,22 +8585,36 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_neg_splat_commute(<vscale x 16
 ;
 ; ZVFHMIN-LABEL: vfnmsub_vf_nxv16f16_neg_splat_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv4r.v v20, v12
-; ZVFHMIN-NEXT:    vmv4r.v v4, v8
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v12, v16, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -8650,14 +8636,14 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_neg_splat_unmasked(<vscale x 16
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
+; ZVFHMIN-NEXT:    vxor.vx v12, v16, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -8681,14 +8667,14 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
+; ZVFHMIN-NEXT:    vxor.vx v12, v16, a1
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -8715,147 +8701,131 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a3, a3, a2
+; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 2
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 5
-; ZVFHMIN-NEXT:    add a0, a2, a0
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v24, a0, v0.t
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v24, v8
+; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a0, 1
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    mv a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a4, a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 1
-; ZVFHMIN-NEXT:    add a3, a3, a4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 4
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    mv a3, a1
+; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a1, a2, .LBB280_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:  .LBB280_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a4, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a5, a4, 5
-; ZVFHMIN-NEXT:    add a4, a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a5, a4, 4
-; ZVFHMIN-NEXT:    add a4, a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    mv a4, a3
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a4, a4, a3
+; ZVFHMIN-NEXT:    mv a4, a3
 ; ZVFHMIN-NEXT:    slli a3, a3, 1
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub a2, a1, a2
+; ZVFHMIN-NEXT:    srli a0, a0, 2
+; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 4
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    sub a2, a1, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    sltu a1, a1, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
 ; ZVFHMIN-NEXT:    addi a1, a1, -1
 ; ZVFHMIN-NEXT:    and a1, a1, a2
-; ZVFHMIN-NEXT:    srli a0, a0, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a2, a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 4
-; ZVFHMIN-NEXT:    add a0, a2, a0
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a1, a1, a0
+; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 2
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -8887,31 +8857,38 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v0, v24, a0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v20, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a1, a4
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a0, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a1, a0
-; ZVFHMIN-NEXT:    sltu a4, a1, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -8921,68 +8898,66 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vmv1r.v v0, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v4, v16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a1, a0, .LBB281_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB281_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v0, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
+; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -9010,44 +8985,56 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a3, 8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v24, v16, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    mv a3, a0
+; ZVFHMIN-NEXT:    vmv4r.v v4, v28
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a2, .LBB282_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:  .LBB282_2:
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    addi a4, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
@@ -9055,43 +9042,50 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
@@ -9100,26 +9094,26 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -9147,35 +9141,35 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a3, 8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
-; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    mv a3, a0
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a2, .LBB283_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -9192,7 +9186,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
@@ -9201,47 +9195,46 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -9278,49 +9271,52 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
+; ZVFHMIN-NEXT:    vxor.vx v16, v24, a1
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vmv4r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a4, a2
@@ -9328,9 +9324,14 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
@@ -9338,12 +9339,12 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -9352,11 +9353,11 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB284_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
@@ -9364,20 +9365,23 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v16
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v0
+; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -9402,88 +9406,91 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a1
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v8, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB285_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB285_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
@@ -9491,19 +9498,27 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -9533,30 +9548,31 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a0, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a0, 1
-; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v12
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vmv4r.v v12, v28
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
@@ -9586,47 +9602,46 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a1, a2
+; ZVFHMIN-NEXT:    srli a0, a0, 2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    sltu a1, a1, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
 ; ZVFHMIN-NEXT:    addi a1, a1, -1
 ; ZVFHMIN-NEXT:    and a1, a1, a2
-; ZVFHMIN-NEXT:    srli a0, a0, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -9659,51 +9674,48 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v24, a0, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a0, 1
-; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    vmv4r.v v4, v20
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a4, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    bltu a1, a2, .LBB287_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:  .LBB287_2:
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    bltu a1, a2, .LBB287_2
+; ZVFHMIN-NEXT:  # %bb.1:
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:  .LBB287_2:
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
@@ -9711,18 +9723,21 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    mv a4, a3
@@ -9730,32 +9745,28 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub a2, a1, a2
+; ZVFHMIN-NEXT:    srli a0, a0, 2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    slli a3, a3, 5
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    sub a2, a1, a2
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    sltu a1, a1, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
 ; ZVFHMIN-NEXT:    addi a1, a1, -1
 ; ZVFHMIN-NEXT:    and a1, a1, a2
-; ZVFHMIN-NEXT:    srli a0, a0, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a2, a0
@@ -9765,17 +9776,20 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -9805,106 +9819,157 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a3, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 2
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a4, a0, 5
+; ZVFHMIN-NEXT:    add a0, a4, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v16, v7, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a3, a1, a4
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    mv a6, a5
+; ZVFHMIN-NEXT:    slli a5, a5, 3
+; ZVFHMIN-NEXT:    add a6, a6, a5
+; ZVFHMIN-NEXT:    slli a5, a5, 1
+; ZVFHMIN-NEXT:    add a5, a5, a6
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a4, a4, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
+; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a4, a2, 4
+; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a0, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a1, a0
-; ZVFHMIN-NEXT:    sltu a4, a1, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    slli a4, a2, 5
+; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a4, a2, 4
+; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 5
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 5
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a1, a0, .LBB288_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB288_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a2, a0, 5
+; ZVFHMIN-NEXT:    add a0, a2, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v0
-; ZVFHMIN-NEXT:    vmv8r.v v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a1, a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -9929,79 +9994,109 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a3, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 2
 ; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub sp, sp, a2
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v8
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vxor.vx v0, v0, a2
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a4, a0, 5
+; ZVFHMIN-NEXT:    add a0, a4, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v8, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v24, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a0, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a1, a0
-; ZVFHMIN-NEXT:    sltu a4, a1, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a4, a4, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 1
 ; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a4, a2, 5
+; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a4, a2, 4
+; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a4, a2, 4
+; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 5
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    bltu a1, a0, .LBB289_2
@@ -10009,8 +10104,9 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB289_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -10018,7 +10114,8 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a2, a0, 4
+; ZVFHMIN-NEXT:    add a0, a2, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
@@ -10028,7 +10125,11 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v8
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a1, a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -10058,21 +10159,21 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a4, 8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    mv a3, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
-; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a4, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a4, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
-; ZVFHMIN-NEXT:    mv a3, a0
 ; ZVFHMIN-NEXT:    addi a4, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
@@ -10086,12 +10187,12 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a2, .LBB290_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a3, a2
@@ -10141,19 +10242,18 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
@@ -10210,51 +10310,59 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a4, 8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    mv a3, a0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
-; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a4, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v24, v16, a4, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
-; ZVFHMIN-NEXT:    mv a3, a0
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vmv4r.v v20, v28
+; ZVFHMIN-NEXT:    addi a4, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a2, .LBB291_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:  .LBB291_2:
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
 ; ZVFHMIN-NEXT:    slli a4, a4, 1
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
@@ -10262,56 +10370,55 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -10347,73 +10454,71 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    mv a4, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, a1, a4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -10474,80 +10579,77 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    mv a4, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, a1, a4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v8, a1
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v24, v8, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB293_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
@@ -10560,17 +10662,16 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v0
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v16
-; ZVFHMIN-NEXT:    vmv8r.v v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10609,25 +10710,25 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a4, 8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 1
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v8, a2
 ; ZVFHMIN-NEXT:    mv a3, a0
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a4, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a4, v0.t
+; ZVFHMIN-NEXT:    slli a2, a1, 1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
@@ -10646,61 +10747,66 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
-; ZVFHMIN-NEXT:    addi a4, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a4, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -10728,40 +10834,43 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a1, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a4, 8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 1
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v8, a2
 ; ZVFHMIN-NEXT:    mv a3, a0
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a4, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v24, v16, a4, v0.t
+; ZVFHMIN-NEXT:    slli a2, a1, 1
 ; ZVFHMIN-NEXT:    addi a4, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
@@ -10770,6 +10879,12 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:  .LBB295_2:
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
 ; ZVFHMIN-NEXT:    slli a4, a4, 1
@@ -10777,41 +10892,49 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
@@ -10820,25 +10943,26 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -10864,78 +10988,103 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a2, a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a1
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a4, a1, 5
+; ZVFHMIN-NEXT:    add a1, a4, a1
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v24, v8, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    vxor.vx v0, v0, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v24, v24, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a4, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 5
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 5
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -10947,8 +11096,9 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB296_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a2, a2, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
@@ -10956,7 +11106,8 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -10966,7 +11117,11 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v16
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a1, a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -10992,77 +11147,105 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a2, a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v8
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a1
+; ZVFHMIN-NEXT:    vxor.vx v0, v0, a2
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a4, a1, 5
+; ZVFHMIN-NEXT:    add a1, a4, a1
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v8, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a4, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 5
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 5
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -11074,8 +11257,9 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB297_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a2, a2, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
@@ -11083,7 +11267,8 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -11093,7 +11278,11 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v8
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a1, a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -11124,30 +11313,31 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a0, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    vxor.vx v8, v16, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a0, 1
-; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    vmv4r.v v4, v12
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vmv4r.v v12, v28
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
@@ -11177,47 +11367,46 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a1, a2
+; ZVFHMIN-NEXT:    srli a0, a0, 2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    sltu a1, a1, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
 ; ZVFHMIN-NEXT:    addi a1, a1, -1
 ; ZVFHMIN-NEXT:    and a1, a1, a2
-; ZVFHMIN-NEXT:    srli a0, a0, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -11250,40 +11439,37 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    mv a3, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a0, v0.t
-; ZVFHMIN-NEXT:    vxor.vx v16, v24, a0, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a0, 1
-; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    vmv4r.v v4, v20
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a4, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a1, a2, .LBB299_2
 ; ZVFHMIN-NEXT:  # %bb.1:
@@ -11293,8 +11479,8 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
@@ -11302,18 +11488,21 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    mv a4, a3
@@ -11321,32 +11510,28 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub a2, a1, a2
+; ZVFHMIN-NEXT:    srli a0, a0, 2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    slli a3, a3, 5
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    sub a2, a1, a2
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    sltu a1, a1, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
 ; ZVFHMIN-NEXT:    addi a1, a1, -1
 ; ZVFHMIN-NEXT:    and a1, a1, a2
-; ZVFHMIN-NEXT:    srli a0, a0, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a2, a0
@@ -11356,17 +11541,20 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_commuted(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
@@ -11396,106 +11584,157 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a3, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 2
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a4, a0, 5
+; ZVFHMIN-NEXT:    add a0, a4, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v16, v7, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a3, a1, a4
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    mv a6, a5
+; ZVFHMIN-NEXT:    slli a5, a5, 3
+; ZVFHMIN-NEXT:    add a6, a6, a5
+; ZVFHMIN-NEXT:    slli a5, a5, 1
+; ZVFHMIN-NEXT:    add a5, a5, a6
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a4, a4, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a3
+; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a4, a2, 4
+; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a0, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a1, a0
-; ZVFHMIN-NEXT:    sltu a4, a1, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    slli a4, a2, 5
+; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a4, a2, 4
+; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 5
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 5
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a1, a0, .LBB300_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB300_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a2, a0, 5
+; ZVFHMIN-NEXT:    add a0, a2, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v0
-; ZVFHMIN-NEXT:    vmv8r.v v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a1, a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -11520,79 +11759,109 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a3, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 2
 ; ZVFHMIN-NEXT:    add a2, a2, a3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub sp, sp, a2
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v8
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v0, v0, a2
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a4, a0, 5
+; ZVFHMIN-NEXT:    add a0, a4, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a0, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v8, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v16, a0
+; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v24, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a0, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a1, a0
-; ZVFHMIN-NEXT:    sltu a4, a1, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a4, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a4, a2, 5
+; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
+; ZVFHMIN-NEXT:    slli a4, a2, 4
+; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a4, a2, 4
+; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a3, a2, 5
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    bltu a1, a0, .LBB301_2
@@ -11600,8 +11869,9 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB301_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -11609,7 +11879,8 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a2, a0, 4
+; ZVFHMIN-NEXT:    add a0, a2, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
@@ -11619,7 +11890,11 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v8
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a1, a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -11649,32 +11924,25 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a3, 8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a2
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
-; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
-; ZVFHMIN-NEXT:    vmv4r.v v4, v12
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    mv a3, a0
-; ZVFHMIN-NEXT:    bltu a0, a2, .LBB302_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:  .LBB302_2:
+; ZVFHMIN-NEXT:    vmv4r.v v12, v20
+; ZVFHMIN-NEXT:    addi a4, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
@@ -11683,36 +11951,75 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    bltu a0, a2, .LBB302_2
+; ZVFHMIN-NEXT:  # %bb.1:
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:  .LBB302_2:
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v28
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
@@ -11720,33 +12027,18 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
@@ -11783,46 +12075,42 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a3, 8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a2
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
-; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    vxor.vx v8, v8, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
-; ZVFHMIN-NEXT:    vmv4r.v v4, v12
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    mv a3, a0
-; ZVFHMIN-NEXT:    bltu a0, a2, .LBB303_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:  .LBB303_2:
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    bltu a0, a2, .LBB303_2
+; ZVFHMIN-NEXT:  # %bb.1:
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:  .LBB303_2:
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v28
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
@@ -11841,41 +12129,50 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
@@ -11909,50 +12206,51 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
 ; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a1
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vmv4r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a4, a2
@@ -11960,22 +12258,27 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -11984,11 +12287,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB304_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
@@ -11996,20 +12299,23 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v0
+; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -12034,50 +12340,46 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
 ; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a1
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    sub a4, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a4
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a3, a3, a4
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vmv4r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a4, a2
@@ -12085,35 +12387,45 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB305_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB305_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
@@ -12121,19 +12433,23 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v16
+; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -12158,43 +12474,32 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a3, 8
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a2
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v24, v16, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    mv a3, a0
-; ZVFHMIN-NEXT:    bltu a0, a2, .LBB306_2
-; ZVFHMIN-NEXT:  # %bb.1:
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:  .LBB306_2:
+; ZVFHMIN-NEXT:    vmv4r.v v20, v28
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
@@ -12202,37 +12507,77 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    addi a4, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    bltu a0, a2, .LBB306_2
+; ZVFHMIN-NEXT:  # %bb.1:
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:  .LBB306_2:
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a4, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
@@ -12241,28 +12586,23 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -12287,125 +12627,118 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 5
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a2, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v24, a1, v0.t
+; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
+; ZVFHMIN-NEXT:    lui a3, 8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a2
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
-; ZVFHMIN-NEXT:    vmv4r.v v28, v20
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 4
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    mv a3, a0
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a2, .LBB307_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:  .LBB307_2:
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    mv a5, a4
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a5, a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v0, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v20
 ; ZVFHMIN-NEXT:    addi a4, sp, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a5, a4, 3
-; ZVFHMIN-NEXT:    add a4, a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 4
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 5
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -12413,8 +12746,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a1, a0, 5
-; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -12439,107 +12774,118 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
-; ZVFHMIN-NEXT:    lui a1, 8
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vmv8r.v v8, v24
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB308_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB308_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
+; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -12564,69 +12910,78 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    lui a1, 8
+; ZVFHMIN-NEXT:    slli a1, a3, 1
+; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a1
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
-; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v8, v16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
@@ -12639,16 +12994,16 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB309_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -12664,7 +13019,10 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
index 7c79706419fcad..1d471ab2404b17 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
@@ -38,12 +38,12 @@ define <vscale x 1 x bfloat> @vfmadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v12, v9, v11
+; CHECK-NEXT:    vfmadd.vv v12, v11, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -75,12 +75,12 @@ define <vscale x 2 x bfloat> @vfmadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfmadd.vv v9, v8, v11
+; CHECK-NEXT:    vfmadd.vv v9, v11, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -112,14 +112,14 @@ define <vscale x 4 x bfloat> @vfmadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v14, v12
+; CHECK-NEXT:    vfmadd.vv v14, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v14
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x bfloat> poison, bfloat %c, i32 0
   %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
@@ -149,14 +149,14 @@ define <vscale x 8 x bfloat> @vfmadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v10
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v20, v16
+; CHECK-NEXT:    vfmadd.vv v20, v16, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v20
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x bfloat> poison, bfloat %c, i32 0
   %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -200,31 +200,16 @@ define <vscale x 16 x bfloat> @vfmadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <
 define <vscale x 16 x bfloat> @vfmadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, bfloat %c) strictfp {
 ; CHECK-LABEL: vfmadd_vf_nxv16bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v0, v24
+; CHECK-NEXT:    vfmadd.vv v0, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x bfloat> poison, bfloat %c, i32 0
   %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
@@ -242,31 +227,30 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vl8re16.v v0, (a0)
-; CHECK-NEXT:    vmv8r.v v24, v16
+; CHECK-NEXT:    vmv8r.v v0, v16
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    vl8re16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v24
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v8, v0
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
@@ -278,41 +262,31 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v16, v24
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmadd.vv v0, v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v16, v8, v24
@@ -347,49 +321,50 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v24, a0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v24
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v16, v24
+; CHECK-NEXT:    vfmadd.vv v24, v16, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v8, v24
+; CHECK-NEXT:    vfmadd.vv v16, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
@@ -440,12 +415,12 @@ define <vscale x 1 x half> @vfmadd_vf_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -490,12 +465,12 @@ define <vscale x 2 x half> @vfmadd_vf_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -540,14 +515,14 @@ define <vscale x 4 x half> @vfmadd_vf_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 4 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -590,14 +565,14 @@ define <vscale x 8 x half> @vfmadd_vf_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -654,31 +629,16 @@ define <vscale x 16 x half> @vfmadd_vf_nxv16f16(<vscale x 16 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfmadd_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -704,31 +664,30 @@ define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vl8re16.v v0, (a0)
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v8, v0
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
@@ -740,41 +699,31 @@ define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
@@ -815,49 +764,50 @@ define <vscale x 32 x half> @vfmadd_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
index 088e3a04984fa0..88fd81a5a2f7bc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
@@ -48,12 +48,12 @@ define <vscale x 1 x bfloat> @vfmadd_vf_nxv1bf16(<vscale x 1 x bfloat> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v12, v9, v11
+; CHECK-NEXT:    vfmadd.vv v12, v11, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v12
 ; CHECK-NEXT:    ret
@@ -86,12 +86,12 @@ define <vscale x 2 x bfloat> @vfmadd_vf_nxv2bf16(<vscale x 2 x bfloat> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v11, v9
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vfmadd.vv v9, v8, v11
+; CHECK-NEXT:    vfmadd.vv v9, v11, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v9
 ; CHECK-NEXT:    ret
@@ -124,14 +124,14 @@ define <vscale x 4 x bfloat> @vfmadd_vf_nxv4bf16(<vscale x 4 x bfloat> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v9
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v9
+; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v14, v9
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v14, v12
+; CHECK-NEXT:    vfmadd.vv v14, v12, v10
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v14
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 4 x bfloat> poison, bfloat %c, i32 0
   %splat = shufflevector <vscale x 4 x bfloat> %head, <vscale x 4 x bfloat> poison, <vscale x 4 x i32> zeroinitializer
@@ -162,14 +162,14 @@ define <vscale x 8 x bfloat> @vfmadd_vf_nxv8bf16(<vscale x 8 x bfloat> %va, <vsc
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vmv.v.x v12, a0
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v10
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    vmv.v.x v8, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v10
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v20, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vfmadd.vv v24, v20, v16
+; CHECK-NEXT:    vfmadd.vv v20, v16, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v20
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 8 x bfloat> poison, bfloat %c, i32 0
   %splat = shufflevector <vscale x 8 x bfloat> %head, <vscale x 8 x bfloat> poison, <vscale x 8 x i32> zeroinitializer
@@ -198,31 +198,16 @@ define <vscale x 16 x bfloat> @vfmadd_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <
 define <vscale x 16 x bfloat> @vfmadd_vf_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, bfloat %c) {
 ; CHECK-LABEL: vfmadd_vf_nxv16bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vmv.v.x v12, a0
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v0, v24
+; CHECK-NEXT:    vfmadd.vv v0, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 16 x bfloat> poison, bfloat %c, i32 0
   %splat = shufflevector <vscale x 16 x bfloat> %head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
@@ -241,18 +226,18 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    slli a1, a1, 5
 ; ZVFH-NEXT:    sub sp, sp, a1
 ; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFH-NEXT:    vl8re16.v v0, (a0)
-; ZVFH-NEXT:    vmv8r.v v24, v16
+; ZVFH-NEXT:    vmv8r.v v0, v16
+; ZVFH-NEXT:    addi a1, sp, 16
+; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv8r.v v16, v8
+; ZVFH-NEXT:    vl8re16.v v8, (a0)
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 4
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vmv8r.v v16, v8
-; ZVFH-NEXT:    addi a0, sp, 16
 ; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v16
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v16
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    mv a1, a0
@@ -260,14 +245,13 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    add a0, a0, a1
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v24
+; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v0
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vmv8r.v v8, v0
+; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v0, v8
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
@@ -281,33 +265,28 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfmadd.vv v0, v16, v24
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vfmadd.vv v0, v8, v24
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
 ; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
-; ZVFH-NEXT:    add a0, sp, a0
-; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    addi a0, sp, 16
 ; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    mv a1, a0
-; ZVFH-NEXT:    slli a0, a0, 1
-; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    slli a0, a0, 4
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v28
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    mv a1, a0
@@ -315,11 +294,6 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    add a0, a0, a1
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add a0, sp, a0
-; ZVFH-NEXT:    addi a0, a0, 16
 ; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFH-NEXT:    vfmadd.vv v16, v8, v24
@@ -342,31 +316,30 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vl8re16.v v0, (a0)
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v8, v0
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
@@ -378,41 +351,31 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
@@ -436,68 +399,91 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    addi sp, sp, -16
 ; ZVFH-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 5
+; ZVFH-NEXT:    sub sp, sp, a0
+; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFH-NEXT:    vmv8r.v v0, v16
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv8r.v v16, v8
+; ZVFH-NEXT:    fmv.x.h a0, fa0
+; ZVFH-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; ZVFH-NEXT:    csrr a1, vlenb
+; ZVFH-NEXT:    slli a1, a1, 4
+; ZVFH-NEXT:    add a1, sp, a1
+; ZVFH-NEXT:    addi a1, a1, 16
+; ZVFH-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; ZVFH-NEXT:    csrr a1, vlenb
+; ZVFH-NEXT:    slli a1, a1, 3
+; ZVFH-NEXT:    add a1, sp, a1
+; ZVFH-NEXT:    addi a1, a1, 16
+; ZVFH-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFH-NEXT:    vmv.v.x v24, a0
+; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    mv a1, a0
 ; ZVFH-NEXT:    slli a0, a0, 1
 ; ZVFH-NEXT:    add a0, a0, a1
-; ZVFH-NEXT:    sub sp, sp, a0
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; ZVFH-NEXT:    vmv8r.v v24, v16
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    fmv.x.h a0, fa0
-; ZVFH-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFH-NEXT:    vmv.v.x v16, a0
+; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 1
+; ZVFH-NEXT:    add a0, a0, a1
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v0
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 4
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v0, v24
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfmadd.vv v0, v16, v24
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vfmadd.vv v8, v24, v0
+; ZVFH-NEXT:    vmv.v.v v24, v8
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 4
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    addi a0, sp, 16
 ; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfmadd.vv v24, v8, v16
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v0
-; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v24
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    mv a1, a0
 ; ZVFH-NEXT:    slli a0, a0, 1
 ; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vfmadd.vv v16, v8, v0
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 5
 ; ZVFH-NEXT:    add sp, sp, a0
 ; ZVFH-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFH-NEXT:    addi sp, sp, 16
@@ -509,64 +495,85 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v0, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v0
+; ZVFHMIN-NEXT:    vmv.v.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v0
-; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -635,12 +642,12 @@ define <vscale x 1 x half> @vfmadd_vf_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -685,12 +692,12 @@ define <vscale x 2 x half> @vfmadd_vf_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -735,14 +742,14 @@ define <vscale x 4 x half> @vfmadd_vf_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vmv.v.x v9, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 4 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 4 x half> %head, <vscale x 4 x half> poison, <vscale x 4 x i32> zeroinitializer
@@ -785,14 +792,14 @@ define <vscale x 8 x half> @vfmadd_vf_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 8 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 8 x half> %head, <vscale x 8 x half> poison, <vscale x 8 x i32> zeroinitializer
@@ -833,31 +840,16 @@ define <vscale x 16 x half> @vfmadd_vf_nxv16f16(<vscale x 16 x half> %va, <vscal
 ;
 ; ZVFHMIN-LABEL: vfmadd_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vmv.v.x v12, a0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 16 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 16 x half> %head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -883,31 +875,30 @@ define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vl8re16.v v0, (a0)
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v8, v0
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
@@ -919,41 +910,31 @@ define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
@@ -983,64 +964,85 @@ define <vscale x 32 x half> @vfmadd_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v0
+; ZVFHMIN-NEXT:    vmv.v.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll
index 8ff7453a0e9a7b..db034fb4862d88 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-sdnode.ll
@@ -195,19 +195,34 @@ declare <vscale x 32 x bfloat> @llvm.maxnum.nxv32bf16(<vscale x 32 x bfloat>, <v
 define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
 ; CHECK-LABEL: vfmax_nxv32bf16_vv:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmax.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfmax.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.maxnum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
   ret <vscale x 32 x bfloat> %v
@@ -216,22 +231,39 @@ define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vs
 define <vscale x 32 x bfloat> @vfmax_nxv32bf16_vf(<vscale x 32 x bfloat> %a, bfloat %b) {
 ; CHECK-LABEL: vfmax_nxv32bf16_vf:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmax.vv v24, v24, v0
+; CHECK-NEXT:    vfmax.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmax.vv v16, v24, v0
+; CHECK-NEXT:    vfmax.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -490,19 +522,34 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmax_nxv32f16_vv:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmax.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmax.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.maxnum.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
   ret <vscale x 32 x half> %v
@@ -517,22 +564,39 @@ define <vscale x 32 x half> @vfmax_nxv32f16_vf(<vscale x 32 x half> %a, half %b)
 ;
 ; ZVFHMIN-LABEL: vfmax_nxv32f16_vf:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmax.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfmax.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
index b8d62b04e5c2ca..dafcf8a1410d32 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmax-vp.ll
@@ -185,20 +185,21 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -236,22 +237,23 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -516,20 +518,21 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -573,22 +576,23 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll
index de49aed6e52b26..3ee82c33485f6e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-sdnode.ll
@@ -195,19 +195,34 @@ declare <vscale x 32 x bfloat> @llvm.minnum.nxv32bf16(<vscale x 32 x bfloat>, <v
 define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
 ; CHECK-LABEL: vfmin_nxv32bf16_vv:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmin.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfmin.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.minnum.nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
   ret <vscale x 32 x bfloat> %v
@@ -216,22 +231,39 @@ define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vv(<vscale x 32 x bfloat> %a, <vs
 define <vscale x 32 x bfloat> @vfmin_nxv32bf16_vf(<vscale x 32 x bfloat> %a, bfloat %b) {
 ; CHECK-LABEL: vfmin_nxv32bf16_vf:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmin.vv v24, v24, v0
+; CHECK-NEXT:    vfmin.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmin.vv v16, v24, v0
+; CHECK-NEXT:    vfmin.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -490,19 +522,34 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vv(<vscale x 32 x half> %a, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmin_nxv32f16_vv:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmin.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmin.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.minnum.nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
   ret <vscale x 32 x half> %v
@@ -517,22 +564,39 @@ define <vscale x 32 x half> @vfmin_nxv32f16_vf(<vscale x 32 x half> %a, half %b)
 ;
 ; ZVFHMIN-LABEL: vfmin_nxv32f16_vf:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmin.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfmin.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
index d6848943110c58..b3df6572f79369 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmin-vp.ll
@@ -185,20 +185,21 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -236,22 +237,23 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -516,20 +518,21 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -573,22 +576,23 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
index d082f85106f16e..978347fa4fc10a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
@@ -24,12 +24,12 @@ define <vscale x 1 x half> @vfmsub_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v10, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
@@ -51,12 +51,12 @@ define <vscale x 1 x half> @vfmsub_vf_nxv1f16(<vscale x 1 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
@@ -80,14 +80,14 @@ define <vscale x 2 x half> @vfmsub_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v10, v9, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
 ; ZVFHMIN-NEXT:    ret
   %neg = fneg <vscale x 2 x half> %vb
   %vd = call <vscale x 2 x half> @llvm.experimental.constrained.fma.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vc, <vscale x 2 x half> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -107,14 +107,14 @@ define <vscale x 2 x half> @vfmsub_vf_nxv2f16(<vscale x 2 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v8, v11
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v11, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 2 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 2 x half> %head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
@@ -136,12 +136,12 @@ define <vscale x 4 x half> @vfmsub_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -163,12 +163,12 @@ define <vscale x 4 x half> @vfmsub_vf_nxv4f16(<vscale x 4 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vxor.vx v8, v9, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v14, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v12, v14
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    ret
@@ -192,14 +192,14 @@ define <vscale x 8 x half> @vfmsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v12, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
   %neg = fneg <vscale x 8 x half> %va
   %vd = call <vscale x 8 x half> @llvm.experimental.constrained.fma.nxv8f16(<vscale x 8 x half> %vb, <vscale x 8 x half> %vc, <vscale x 8 x half> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -219,12 +219,12 @@ define <vscale x 8 x half> @vfmsub_vf_nxv8f16(<vscale x 8 x half> %va, <vscale x
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
@@ -256,14 +256,14 @@ define <vscale x 16 x half> @vfmsub_vv_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
@@ -299,14 +299,14 @@ define <vscale x 16 x half> @vfmsub_vf_nxv16f16(<vscale x 16 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs4r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl4r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
@@ -342,55 +342,49 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -425,7 +419,6 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -433,19 +426,21 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -455,10 +450,20 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -469,15 +474,12 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
index 9a68da58096d66..9e6b1f6fefd088 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-constrained-sdnode.ll
@@ -190,19 +190,34 @@ define <vscale x 16 x bfloat> @vfmul_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 define <vscale x 32 x bfloat> @vfmul_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) strictfp {
 ; CHECK-LABEL: vfmul_vv_nxv32bf16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmul.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfmul.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmul.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 entry:
   %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fmul.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
@@ -212,22 +227,39 @@ entry:
 define <vscale x 32 x bfloat> @vfmul_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) strictfp {
 ; CHECK-LABEL: vfmul_vf_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmul.vv v24, v24, v0
+; CHECK-NEXT:    vfmul.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmul.vv v16, v16, v24
+; CHECK-NEXT:    vfmul.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -485,19 +517,34 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0: # %entry
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmul.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
 entry:
   %vc = call <vscale x 32 x half> @llvm.experimental.constrained.fmul.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
@@ -513,22 +560,39 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfmul_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfmul.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll
index bbacbaa8e5e491..fb8ed3f943fff6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-sdnode.ll
@@ -204,19 +204,34 @@ define <vscale x 16 x bfloat> @vfmul_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 define <vscale x 32 x bfloat> @vfmul_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
 ; CHECK-LABEL: vfmul_vv_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmul.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfmul.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmul.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %vc = fmul <vscale x 32 x bfloat> %va, %vb
   ret <vscale x 32 x bfloat> %vc
@@ -225,22 +240,39 @@ define <vscale x 32 x bfloat> @vfmul_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 define <vscale x 32 x bfloat> @vfmul_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
 ; CHECK-LABEL: vfmul_vf_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmul.vv v24, v24, v0
+; CHECK-NEXT:    vfmul.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmul.vv v16, v24, v0
+; CHECK-NEXT:    vfmul.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -512,19 +544,34 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmul_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmul.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %vc = fmul <vscale x 32 x half> %va, %vb
   ret <vscale x 32 x half> %vc
@@ -539,22 +586,39 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfmul_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfmul.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index eacc8676f34833..f4a236df4c9e4f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -497,20 +497,21 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -554,22 +555,23 @@ define <vscale x 32 x half> @vfmul_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -612,23 +614,24 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v8
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    slli a3, a1, 3
+; ZVFHMIN-NEXT:    add a1, a3, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
@@ -636,18 +639,18 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 3
-; ZVFHMIN-NEXT:    add a2, a4, a2
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 3
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -705,31 +708,37 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    vmv4r.v v16, v8
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
index 6f2c5b6e46f86b..d1702268f829fa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmuladd-vp.ll
@@ -1108,10 +1108,10 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    li a3, 48
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a3, 24
@@ -1127,27 +1127,33 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    slli a5, a1, 3
-; CHECK-NEXT:    add a6, a2, a5
-; CHECK-NEXT:    vl8re64.v v8, (a6)
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 4
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; CHECK-NEXT:    add a5, a0, a5
 ; CHECK-NEXT:    sub a6, a4, a1
+; CHECK-NEXT:    add a7, a2, a5
+; CHECK-NEXT:    add a5, a0, a5
+; CHECK-NEXT:    vl8re64.v v8, (a7)
+; CHECK-NEXT:    csrr a7, vlenb
+; CHECK-NEXT:    slli a7, a7, 3
+; CHECK-NEXT:    add a7, sp, a7
+; CHECK-NEXT:    addi a7, a7, 16
+; CHECK-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
 ; CHECK-NEXT:    sltu a7, a4, a6
 ; CHECK-NEXT:    addi a7, a7, -1
 ; CHECK-NEXT:    vl8re64.v v8, (a5)
-; CHECK-NEXT:    vl8re64.v v16, (a2)
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    li t0, 40
+; CHECK-NEXT:    mul a5, a5, t0
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    and a0, a7, a6
@@ -1158,14 +1164,20 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    li a2, 40
 ; CHECK-NEXT:    mul a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
@@ -1181,23 +1193,22 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    li a1, 40
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
+; CHECK-NEXT:    li a1, 48
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -1214,58 +1225,69 @@ define <vscale x 16 x double> @vfma_vv_nxv16f64_unmasked(<vscale x 16 x double>
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    add a5, a2, a3
-; CHECK-NEXT:    vl8re64.v v24, (a5)
+; CHECK-NEXT:    vl8re64.v v8, (a5)
+; CHECK-NEXT:    csrr a5, vlenb
+; CHECK-NEXT:    slli a5, a5, 3
+; CHECK-NEXT:    add a5, sp, a5
+; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a3, a0, a3
-; CHECK-NEXT:    vl8re64.v v16, (a3)
-; CHECK-NEXT:    sub a3, a4, a1
-; CHECK-NEXT:    sltu a5, a4, a3
+; CHECK-NEXT:    vl8re64.v v24, (a3)
+; CHECK-NEXT:    sltu a3, a4, a5
 ; CHECK-NEXT:    vl8re64.v v8, (a2)
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a3, a5, a3
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a3, a3, a5
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v8, v24
+; CHECK-NEXT:    vfmadd.vv v24, v16, v8
 ; CHECK-NEXT:    bltu a4, a1, .LBB93_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB93_2:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v24, v8
+; CHECK-NEXT:    vfmadd.vv v0, v16, v8
 ; CHECK-NEXT:    vmv.v.v v8, v0
+; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
index 6fa6c26890c3e0..343098e87649ea 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfneg-vp.ll
@@ -453,12 +453,12 @@ define <vscale x 16 x double> @vfneg_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfneg.v v16, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll
index 73adbba3df9216..3705e73fda492e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmadd-constrained-sdnode.ll
@@ -173,8 +173,8 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
+; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
@@ -233,8 +233,8 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
+; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
@@ -293,8 +293,8 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
@@ -329,74 +329,76 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    li a2, 24
+; ZVFHMIN-NEXT:    mul a1, a1, a2
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8re16.v v16, (a0)
 ; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a0
+; ZVFHMIN-NEXT:    vxor.vx v8, v0, a0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vxor.vx v0, v16, a0
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v0, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v28
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -422,82 +424,69 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a0
+; ZVFHMIN-NEXT:    vmv.v.x v0, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v16, v8, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vxor.vx v8, v0, a0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll
index 0d6d24d247af4b..80edf0e3a4d811 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfnmsub-constrained-sdnode.ll
@@ -24,14 +24,14 @@ define <vscale x 1 x half> @vfnmsub_vv_nxv1f16(<vscale x 1 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v10
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v10, v11, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v10, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %neg = fneg <vscale x 1 x half> %va
   %vd = call <vscale x 1 x half> @llvm.experimental.constrained.fma.nxv1f16(<vscale x 1 x half> %neg, <vscale x 1 x half> %vb, <vscale x 1 x half> %vc, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -51,14 +51,14 @@ define <vscale x 1 x half> @vfnmsub_vf_nxv1f16(<vscale x 1 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 1 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 1 x half> %head, <vscale x 1 x half> poison, <vscale x 1 x i32> zeroinitializer
@@ -80,14 +80,14 @@ define <vscale x 2 x half> @vfnmsub_vv_nxv2f16(<vscale x 2 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %neg = fneg <vscale x 2 x half> %va
   %vd = call <vscale x 2 x half> @llvm.experimental.constrained.fma.nxv2f16(<vscale x 2 x half> %neg, <vscale x 2 x half> %vc, <vscale x 2 x half> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -107,14 +107,14 @@ define <vscale x 2 x half> @vfnmsub_vf_nxv2f16(<vscale x 2 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v9
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v11, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v9, v11, v8
+; ZVFHMIN-NEXT:    vfmadd.vv v12, v9, v11
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v12
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 2 x half> poison, half %c, i32 0
   %splat = shufflevector <vscale x 2 x half> %head, <vscale x 2 x half> poison, <vscale x 2 x i32> zeroinitializer
@@ -136,14 +136,14 @@ define <vscale x 4 x half> @vfnmsub_vv_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
 ; ZVFHMIN-NEXT:    vxor.vx v9, v9, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v10, v12, v14
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
   %neg = fneg <vscale x 4 x half> %vb
   %vd = call <vscale x 4 x half> @llvm.experimental.constrained.fma.nxv4f16(<vscale x 4 x half> %neg, <vscale x 4 x half> %va, <vscale x 4 x half> %vc, metadata !"round.dynamic", metadata !"fpexcept.strict")
@@ -163,12 +163,12 @@ define <vscale x 4 x half> @vfnmsub_vf_nxv4f16(<vscale x 4 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v10, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT:    vxor.vx v9, v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v9
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v14, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v14, v12, v10
+; ZVFHMIN-NEXT:    vfmadd.vv v14, v10, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v14
 ; ZVFHMIN-NEXT:    ret
@@ -192,12 +192,12 @@ define <vscale x 8 x half> @vfnmsub_vv_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v10, v10, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v8, v10, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v20
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v20, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    ret
@@ -219,12 +219,12 @@ define <vscale x 8 x half> @vfnmsub_vf_nxv8f16(<vscale x 8 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v12, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vxor.vx v8, v12, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v20, v10
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v20, v16, v12
+; ZVFHMIN-NEXT:    vfmadd.vv v20, v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v20
 ; ZVFHMIN-NEXT:    ret
@@ -248,12 +248,12 @@ define <vscale x 16 x half> @vfnmsub_vv_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vxor.vx v12, v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -275,12 +275,12 @@ define <vscale x 16 x half> @vfnmsub_vf_nxv16f16(<vscale x 16 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a0
 ; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a0
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vxor.vx v12, v16, a0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    ret
@@ -306,79 +306,58 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    li a2, 24
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
-; ZVFHMIN-NEXT:    vmv8r.v v0, v8
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v24, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v0, v24, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v12
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -401,79 +380,60 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
-; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
-; ZVFHMIN-NEXT:    lui a0, 8
-; ZVFHMIN-NEXT:    vxor.vx v0, v16, a0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v24, a0
+; ZVFHMIN-NEXT:    lui a0, 8
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vxor.vx v0, v24, a0
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 24
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
index 5962d38b1baa72..341fe678183b6f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfpext-vp.ll
@@ -99,9 +99,9 @@ define <vscale x 32 x float> @vfpext_nxv32f16_nxv32f32(<vscale x 32 x half> %a,
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
index 37e14783d18733..d707b4254d3e14 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
@@ -628,17 +628,17 @@ define <vscale x 32 x i1> @vfptosi_nxv32bf16_nxv32i1(<vscale x 32 x bfloat> %va)
 ; CHECK-LABEL: vfptosi_nxv32bf16_nxv32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v12, v16
-; CHECK-NEXT:    vand.vi v12, v12, 1
-; CHECK-NEXT:    vmsne.vi v16, v12, 0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v8, v24
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v8, v16
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v12, v24
+; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vand.vi v12, v12, 1
+; CHECK-NEXT:    vmsne.vi v16, v8, 0
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v16, a0
 ; CHECK-NEXT:    ret
@@ -650,17 +650,17 @@ define <vscale x 32 x i1> @vfptoui_nxv32bf16_nxv32i1(<vscale x 32 x bfloat> %va)
 ; CHECK-LABEL: vfptoui_nxv32bf16_nxv32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vfncvt.rtz.xu.f.w v12, v16
-; CHECK-NEXT:    vand.vi v12, v12, 1
-; CHECK-NEXT:    vmsne.vi v16, v12, 0
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfncvt.rtz.xu.f.w v8, v24
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v8, v16
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v12, v24
+; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vand.vi v8, v8, 1
-; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vand.vi v12, v12, 1
+; CHECK-NEXT:    vmsne.vi v16, v8, 0
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v16, a0
 ; CHECK-NEXT:    ret
@@ -673,12 +673,12 @@ define <vscale x 32 x i8> @vfptosi_nxv32bf16_nxv32i8(<vscale x 32 x bfloat> %va)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v24, 0
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v12, v16
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v12, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v12, 0
 ; CHECK-NEXT:    ret
@@ -691,12 +691,12 @@ define <vscale x 32 x i8> @vfptoui_nxv32bf16_nxv32i8(<vscale x 32 x bfloat> %va)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
-; CHECK-NEXT:    vfncvt.rtz.xu.f.w v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v12, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v24, 0
+; CHECK-NEXT:    vnsrl.wi v8, v12, 0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vfncvt.rtz.xu.f.w v12, v16
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v12, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v10, v12, 0
 ; CHECK-NEXT:    ret
@@ -1648,17 +1648,17 @@ define <vscale x 32 x i1> @vfptosi_nxv32f16_nxv32i1(<vscale x 32 x half> %va) {
 ; ZVFHMIN-LABEL: vfptosi_nxv32f16_nxv32i1:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    srli a0, a0, 2
-; ZVFHMIN-NEXT:    add a1, a0, a0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v12, v16
-; ZVFHMIN-NEXT:    vand.vi v12, v12, 1
-; ZVFHMIN-NEXT:    vmsne.vi v16, v12, 0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v24
+; ZVFHMIN-NEXT:    srli a0, a0, 2
+; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v12, v24
+; ZVFHMIN-NEXT:    add a1, a0, a0
 ; ZVFHMIN-NEXT:    vand.vi v8, v8, 1
-; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
+; ZVFHMIN-NEXT:    vand.vi v12, v12, 1
+; ZVFHMIN-NEXT:    vmsne.vi v16, v8, 0
+; ZVFHMIN-NEXT:    vmsne.vi v0, v12, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v0, v16, a0
 ; ZVFHMIN-NEXT:    ret
@@ -1678,17 +1678,17 @@ define <vscale x 32 x i1> @vfptoui_nxv32f16_nxv32i1(<vscale x 32 x half> %va) {
 ; ZVFHMIN-LABEL: vfptoui_nxv32f16_nxv32i1:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    srli a0, a0, 2
-; ZVFHMIN-NEXT:    add a1, a0, a0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v12, v16
-; ZVFHMIN-NEXT:    vand.vi v12, v12, 1
-; ZVFHMIN-NEXT:    vmsne.vi v16, v12, 0
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v24
+; ZVFHMIN-NEXT:    srli a0, a0, 2
+; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v16
+; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v12, v24
+; ZVFHMIN-NEXT:    add a1, a0, a0
 ; ZVFHMIN-NEXT:    vand.vi v8, v8, 1
-; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
+; ZVFHMIN-NEXT:    vand.vi v12, v12, 1
+; ZVFHMIN-NEXT:    vmsne.vi v16, v8, 0
+; ZVFHMIN-NEXT:    vmsne.vi v0, v12, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v0, v16, a0
 ; ZVFHMIN-NEXT:    ret
@@ -1708,12 +1708,12 @@ define <vscale x 32 x i8> @vfptosi_nxv32f16_nxv32i8(<vscale x 32 x half> %va) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; ZVFHMIN-NEXT:    vnsrl.wi v8, v24, 0
+; ZVFHMIN-NEXT:    vnsrl.wi v8, v12, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v12, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v10, v12, 0
 ; ZVFHMIN-NEXT:    ret
@@ -1733,12 +1733,12 @@ define <vscale x 32 x i8> @vfptoui_nxv32f16_nxv32i8(<vscale x 32 x half> %va) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v12, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
-; ZVFHMIN-NEXT:    vnsrl.wi v8, v24, 0
+; ZVFHMIN-NEXT:    vnsrl.wi v8, v12, 0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v12, v16
+; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v12, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; ZVFHMIN-NEXT:    vnsrl.wi v10, v12, 0
 ; ZVFHMIN-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
index 23d47cc3cb1fd4..cf195c7c0935e4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptosi-vp.ll
@@ -508,42 +508,26 @@ declare <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x flo
 define <vscale x 32 x i16> @vfptosi_nxv32i16_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfptosi_nxv32i16_nxv32f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v20, v24, v0.t
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v28, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB34_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.x.f.w v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfncvt.rtz.x.f.w v24, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
@@ -557,9 +541,9 @@ define <vscale x 32 x i32> @vfptosi_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
index acd360f06046f4..952d28604b86c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoui-vp.ll
@@ -508,42 +508,26 @@ declare <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x flo
 define <vscale x 32 x i16> @vfptoui_nxv32i16_nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfptoui_nxv32i16_nxv32f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.xu.f.w v20, v24, v0.t
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v28, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB34_2:
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvt.rtz.xu.f.w v16, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfncvt.rtz.xu.f.w v24, v8, v0.t
+; CHECK-NEXT:    vmv8r.v v8, v24
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x i16> %v
@@ -557,9 +541,9 @@ define <vscale x 32 x i32> @vfptoui_nxv32i32_nxv32f32(<vscale x 32 x float> %va,
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index b2bfb10d39df30..874813f0575953 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -107,12 +107,12 @@ define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double>
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    addi a3, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
@@ -157,35 +157,35 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    srli a4, a1, 2
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v0, a4
-; CHECK-NEXT:    slli a4, a1, 3
-; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    vl8re64.v v8, (a4)
+; CHECK-NEXT:    srli a5, a1, 2
+; CHECK-NEXT:    slli a6, a1, 3
 ; CHECK-NEXT:    slli a4, a1, 1
+; CHECK-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v16, v0, a5
+; CHECK-NEXT:    add a6, a0, a6
 ; CHECK-NEXT:    sub a5, a2, a4
+; CHECK-NEXT:    vl8re64.v v24, (a6)
 ; CHECK-NEXT:    sltu a6, a2, a5
 ; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    sub a6, a5, a1
 ; CHECK-NEXT:    sltu a7, a5, a6
 ; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    vl8re64.v v24, (a0)
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v16, a3
 ; CHECK-NEXT:    and a0, a7, a6
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v20, v8, v0.t
+; CHECK-NEXT:    vfncvt.f.f.w v20, v24, v0.t
 ; CHECK-NEXT:    bltu a5, a1, .LBB8_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a5, a1
 ; CHECK-NEXT:  .LBB8_2:
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v6, v7, a3
-; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v16, v24, v0.t
+; CHECK-NEXT:    vfncvt.f.f.w v16, v8, v0.t
 ; CHECK-NEXT:    bltu a2, a4, .LBB8_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a4

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
index d92db0b5a3a7bc..d5e65e2c8fd3f2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-constrained-sdnode.ll
@@ -92,13 +92,13 @@ define <vscale x 32 x bfloat> @vfsqrt_nxv32bf16(<vscale x 32 x bfloat> %v) stric
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v16, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsqrt.v v16, v16
+; CHECK-NEXT:    vfsqrt.v v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    ret
@@ -229,13 +229,13 @@ define <vscale x 32 x half> @vfsqrt_nxv32f16(<vscale x 32 x half> %v) strictfp {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
+; ZVFHMIN-NEXT:    vfsqrt.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
index a51b0e4efecf23..4d761981aac97f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-sdnode.ll
@@ -87,13 +87,13 @@ define <vscale x 32 x bfloat> @vfsqrt_nxv32bf16(<vscale x 32 x bfloat> %v) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v16, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsqrt.v v16, v16
+; CHECK-NEXT:    vfsqrt.v v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    ret
@@ -224,13 +224,13 @@ define <vscale x 32 x half> @vfsqrt_nxv32f16(<vscale x 32 x half> %v) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsqrt.v v16, v16
+; ZVFHMIN-NEXT:    vfsqrt.v v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
index 00542284ebaee2..8edcf23988c7fb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsqrt-vp.ll
@@ -170,16 +170,16 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; CHECK-NEXT:    vmv1r.v v16, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
@@ -202,19 +202,19 @@ define <vscale x 32 x bfloat> @vfsqrt_vv_nxv32bf16_unmasked(<vscale x 32 x bfloa
 ; CHECK-LABEL: vfsqrt_vv_nxv32bf16_unmasked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v16
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v16, a2
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -455,16 +455,16 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vmv1r.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
@@ -493,19 +493,19 @@ define <vscale x 32 x half> @vfsqrt_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-LABEL: vfsqrt_vv_nxv32f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v16
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v16, a2
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsqrt.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -752,12 +752,12 @@ define <vscale x 16 x double> @vfsqrt_vv_nxv16f64(<vscale x 16 x double> %va, <v
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfsqrt.v v16, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
index 9b24b1df0f064b..fbb14ab43e0759 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-constrained-sdnode.ll
@@ -209,19 +209,34 @@ define <vscale x 16 x bfloat> @vfsub_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) strictfp {
 ; CHECK-LABEL: vfsub_vv_nxv32bf16:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfsub.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsub.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 entry:
   %vc = call <vscale x 32 x bfloat> @llvm.experimental.constrained.fsub.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
@@ -231,22 +246,39 @@ entry:
 define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) strictfp {
 ; CHECK-LABEL: vfsub_vf_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v24, v24, v0
+; CHECK-NEXT:    vfsub.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v16, v24
+; CHECK-NEXT:    vfsub.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -529,19 +561,34 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0: # %entry
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfsub.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
 entry:
   %vc = call <vscale x 32 x half> @llvm.experimental.constrained.fsub.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, metadata !"round.dynamic", metadata !"fpexcept.ignore")
@@ -557,22 +604,39 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfsub_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfsub.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
+; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
index a2137eaa7a9585..f80644777c72a4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-sdnode.ll
@@ -204,19 +204,34 @@ define <vscale x 16 x bfloat> @vfsub_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bf
 define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x bfloat> %vb) {
 ; CHECK-LABEL: vfsub_vv_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
-; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v24, v0, v24
-; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vfsub.vv v0, v0, v8
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsub.vv v16, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %vc = fsub <vscale x 32 x bfloat> %va, %vb
   ret <vscale x 32 x bfloat> %vc
@@ -225,22 +240,39 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bfloat %b) {
 ; CHECK-LABEL: vfsub_vf_nxv32bf16:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a0, fa0
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a0
+; CHECK-NEXT:    vmv.v.x v8, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v24, v24, v0
+; CHECK-NEXT:    vfsub.vv v0, v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v20
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v0
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v24, v0
+; CHECK-NEXT:    vfsub.vv v16, v24, v16
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %head = insertelement <vscale x 32 x bfloat> poison, bfloat %b, i32 0
   %splat = shufflevector <vscale x 32 x bfloat> %head, <vscale x 32 x bfloat> poison, <vscale x 32 x i32> zeroinitializer
@@ -512,19 +544,34 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfsub_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v24, v0, v24
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfsub.vv v0, v0, v8
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %vc = fsub <vscale x 32 x half> %va, %vb
   ret <vscale x 32 x half> %vc
@@ -539,22 +586,39 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ;
 ; ZVFHMIN-LABEL: vfsub_vf_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    sub sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a0
+; ZVFHMIN-NEXT:    vmv.v.x v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v24, v24, v0
+; ZVFHMIN-NEXT:    vfsub.vv v0, v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v0
+; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %head = insertelement <vscale x 32 x half> poison, half %b, i32 0
   %splat = shufflevector <vscale x 32 x half> %head, <vscale x 32 x half> poison, <vscale x 32 x i32> zeroinitializer

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index 2d34bfff95c4cf..25a80e66c4a527 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -375,20 +375,21 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -426,22 +427,23 @@ define <vscale x 32 x bfloat> @vfsub_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -478,23 +480,24 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; CHECK-NEXT:    vmv8r.v v24, v8
+; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a1
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 3
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    add a1, a3, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a4, a4, 3
 ; CHECK-NEXT:    add a4, sp, a4
@@ -502,18 +505,18 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a4, a2, 3
-; CHECK-NEXT:    add a2, a4, a2
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a4, a3, 3
+; CHECK-NEXT:    add a3, a4, a3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
@@ -565,31 +568,37 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    fmv.x.h a1, fa0
-; CHECK-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v8, a1
 ; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.x v16, a1
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    slli a1, a2, 1
-; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    sltu a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    srli a2, a2, 2
-; CHECK-NEXT:    vmset.m v24
+; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
-; CHECK-NEXT:    vmv4r.v v16, v8
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a3, sp, a3
+; CHECK-NEXT:    addi a3, a3, 16
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB23_2
@@ -1110,20 +1119,21 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1167,22 +1177,23 @@ define <vscale x 32 x half> @vfsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1225,23 +1236,24 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v24, v8
+; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    slli a3, a1, 3
+; ZVFHMIN-NEXT:    add a1, a3, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
@@ -1249,18 +1261,18 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 3
-; ZVFHMIN-NEXT:    add a2, a4, a2
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 3
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
@@ -1318,31 +1330,37 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a1
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
+; ZVFHMIN-NEXT:    vmv.v.x v16, a1
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
-; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    sltu a4, a0, a3
-; ZVFHMIN-NEXT:    addi a4, a4, -1
-; ZVFHMIN-NEXT:    and a3, a4, a3
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
-; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v24, a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    vmv4r.v v16, v8
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
+; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    sltu a2, a0, a3
+; ZVFHMIN-NEXT:    addi a2, a2, -1
+; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB47_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
index 0f76968485fb43..4265663c1feee9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vitofp-sdnode.ll
@@ -147,10 +147,10 @@ define <vscale x 32 x bfloat> @vsitofp_nxv32i1_nxv32bf16(<vscale x 32 x i1> %va)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vim v8, v12, -1, v0
-; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v8, v12, -1, v0
 ; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vfwcvt.f.x.v v16, v8
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -168,10 +168,10 @@ define <vscale x 32 x bfloat> @vuitofp_nxv32i1_nxv32bf16(<vscale x 32 x i1> %va)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
-; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v8, v12, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vfwcvt.f.xu.v v16, v8
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -339,12 +339,11 @@ define <vscale x 32 x bfloat> @vsitofp_nxv32i8_nxv32bf16(<vscale x 32 x i8> %va)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vsext.vf2 v16, v10
 ; CHECK-NEXT:    vfwcvt.f.x.v v24, v12
-; CHECK-NEXT:    vfncvtbf16.f.f.w v16, v24
-; CHECK-NEXT:    vsext.vf2 v12, v10
-; CHECK-NEXT:    vfwcvt.f.x.v v24, v12
-; CHECK-NEXT:    vfncvtbf16.f.f.w v20, v24
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT:    vfwcvt.f.x.v v24, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    ret
   %evec = sitofp <vscale x 32 x i8> %va to <vscale x 32 x bfloat>
   ret <vscale x 32 x bfloat> %evec
@@ -355,12 +354,11 @@ define <vscale x 32 x bfloat> @vuitofp_nxv32i8_nxv32bf16(<vscale x 32 x i8> %va)
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vzext.vf2 v12, v8
+; CHECK-NEXT:    vzext.vf2 v16, v10
 ; CHECK-NEXT:    vfwcvt.f.xu.v v24, v12
-; CHECK-NEXT:    vfncvtbf16.f.f.w v16, v24
-; CHECK-NEXT:    vzext.vf2 v12, v10
-; CHECK-NEXT:    vfwcvt.f.xu.v v24, v12
-; CHECK-NEXT:    vfncvtbf16.f.f.w v20, v24
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
+; CHECK-NEXT:    vfwcvt.f.xu.v v24, v16
+; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v24
 ; CHECK-NEXT:    ret
   %evec = uitofp <vscale x 32 x i8> %va to <vscale x 32 x bfloat>
   ret <vscale x 32 x bfloat> %evec
@@ -1157,10 +1155,10 @@ define <vscale x 32 x half> @vsitofp_nxv32i1_nxv32f16(<vscale x 32 x i1> %va) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v12, 0
-; ZVFHMIN-NEXT:    vmerge.vim v8, v12, -1, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.x.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    vmerge.vim v8, v12, -1, v0
 ; ZVFHMIN-NEXT:    srli a0, a0, 2
+; ZVFHMIN-NEXT:    vfwcvt.f.x.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -1186,10 +1184,10 @@ define <vscale x 32 x half> @vuitofp_nxv32i1_nxv32f16(<vscale x 32 x i1> %va) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.i v12, 0
-; ZVFHMIN-NEXT:    vmerge.vim v8, v12, 1, v0
-; ZVFHMIN-NEXT:    vfwcvt.f.xu.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    vmerge.vim v8, v12, 1, v0
 ; ZVFHMIN-NEXT:    srli a0, a0, 2
+; ZVFHMIN-NEXT:    vfwcvt.f.xu.v v16, v8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
@@ -1648,12 +1646,11 @@ define <vscale x 32 x half> @vsitofp_nxv32i8_nxv32f16(<vscale x 32 x i8> %va) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vsext.vf2 v12, v8
+; ZVFHMIN-NEXT:    vsext.vf2 v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.x.v v24, v12
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; ZVFHMIN-NEXT:    vsext.vf2 v12, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.x.v v24, v12
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.x.v v24, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    ret
   %evec = sitofp <vscale x 32 x i8> %va to <vscale x 32 x half>
   ret <vscale x 32 x half> %evec
@@ -1671,12 +1668,11 @@ define <vscale x 32 x half> @vuitofp_nxv32i8_nxv32f16(<vscale x 32 x i8> %va) {
 ; ZVFHMIN:       # %bb.0:
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vzext.vf2 v12, v8
+; ZVFHMIN-NEXT:    vzext.vf2 v16, v10
 ; ZVFHMIN-NEXT:    vfwcvt.f.xu.v v24, v12
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
-; ZVFHMIN-NEXT:    vzext.vf2 v12, v10
-; ZVFHMIN-NEXT:    vfwcvt.f.xu.v v24, v12
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v24
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.xu.v v24, v16
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
 ; ZVFHMIN-NEXT:    ret
   %evec = uitofp <vscale x 32 x i8> %va to <vscale x 32 x half>
   ret <vscale x 32 x half> %evec

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll
index 1a01a9bf77cff5..318a5bc92a779e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll
@@ -14,10 +14,10 @@ define <2 x i32> @vdot_lane_s32(<2 x i32> noundef %var_1, <8 x i8> noundef %var_
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v11, 0
 ; CHECK-NEXT:    vnsrl.wi v9, v11, 16
+; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vwadd.vv v10, v8, v9
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v10, 0
-; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:    vnsrl.wx v9, v10, a0
 ; CHECK-NEXT:    vadd.vv v8, v8, v9
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
index 1f778dc1d5c884..0b553d3cd6fdf4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmax-vp.ll
@@ -11,10 +11,10 @@ define <vscale x 8 x i7> @vmax_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
 ; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
 ; CHECK-NEXT:    vmax.vv v8, v8, v9, v0.t
@@ -977,9 +977,9 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    sltu a4, a1, a3
 ; CHECK-NEXT:    addi a4, a4, -1
@@ -1036,10 +1036,10 @@ define <vscale x 32 x i32> @vmax_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a3, a1, 2
 ; CHECK-NEXT:    slli a2, a1, 1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    sltu a4, a1, a3
 ; CHECK-NEXT:    addi a4, a4, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
index a2b160b5a0ebb6..f6be882f742062 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmaxu-vp.ll
@@ -10,11 +10,10 @@ define <vscale x 8 x i7> @vmaxu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vmaxu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vmaxu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
@@ -976,9 +975,9 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    sltu a4, a1, a3
 ; CHECK-NEXT:    addi a4, a4, -1
@@ -1035,10 +1034,10 @@ define <vscale x 32 x i32> @vmaxu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a3, a1, 2
 ; CHECK-NEXT:    slli a2, a1, 1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    sltu a4, a1, a3
 ; CHECK-NEXT:    addi a4, a4, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
index b074888ffffa85..8690014cc2c9df 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmin-vp.ll
@@ -11,10 +11,10 @@ define <vscale x 8 x i7> @vmin_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
 ; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
 ; CHECK-NEXT:    vmin.vv v8, v8, v9, v0.t
@@ -977,9 +977,9 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <v
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    sltu a4, a1, a3
 ; CHECK-NEXT:    addi a4, a4, -1
@@ -1036,10 +1036,10 @@ define <vscale x 32 x i32> @vmin_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a3, a1, 2
 ; CHECK-NEXT:    slli a2, a1, 1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    sltu a4, a1, a3
 ; CHECK-NEXT:    addi a4, a4, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
index c7afd549d8e921..414807829d5630 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vminu-vp.ll
@@ -10,11 +10,10 @@ define <vscale x 8 x i7> @vminu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vminu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vminu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret
@@ -976,9 +975,9 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b, <
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a3, a2, 2
+; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    sltu a4, a1, a3
 ; CHECK-NEXT:    addi a4, a4, -1
@@ -1035,10 +1034,10 @@ define <vscale x 32 x i32> @vminu_vx_nxv32i32_evl_nx8(<vscale x 32 x i32> %va, i
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 2
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a3, a1, 2
 ; CHECK-NEXT:    slli a2, a1, 1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
 ; CHECK-NEXT:    sltu a4, a1, a3
 ; CHECK-NEXT:    addi a4, a4, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll
index ae3195c479ea91..3eb767f90f3bd7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmul-sdnode.ll
@@ -897,8 +897,8 @@ define <vscale x 8 x i64> @vmul_xx_nxv8i64(i64 %a, i64 %b) nounwind {
 ; RV32M-NEXT:    mul a4, a0, a2
 ; RV32M-NEXT:    mul a3, a0, a3
 ; RV32M-NEXT:    mulhu a0, a0, a2
-; RV32M-NEXT:    add a0, a0, a3
 ; RV32M-NEXT:    mul a1, a1, a2
+; RV32M-NEXT:    add a0, a0, a3
 ; RV32M-NEXT:    add a0, a0, a1
 ; RV32M-NEXT:    sw a4, 8(sp)
 ; RV32M-NEXT:    sw a0, 12(sp)

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll
index 253cfb040308b4..9ab293faf87d13 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode.ll
@@ -8,9 +8,9 @@ define <vscale x 4 x i1> @srem_eq_fold_nxv4i8(<vscale x 4 x i8> %va) {
 ; CHECK-LABEL: srem_eq_fold_nxv4i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 42
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a1, -85
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vmacc.vx v9, a1, v8
 ; CHECK-NEXT:    vsll.vi v8, v9, 7
 ; CHECK-NEXT:    vsrl.vi v9, v9, 1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
index 07661c57640457..4629db26ca0341 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.s.x.ll
@@ -349,11 +349,11 @@ define <vscale x 1 x i64> @intrinsic_vmv.s.x_x_nxv1i64_bug(<vscale x 1 x i64> %0
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    lw a1, 0(a0)
 ; RV32-NEXT:    lw a0, 4(a0)
-; RV32-NEXT:    sw a1, 8(sp)
-; RV32-NEXT:    sw a0, 12(sp)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, mu
 ; RV32-NEXT:    vid.v v9
 ; RV32-NEXT:    vmseq.vi v0, v9, 0
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    sw a0, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vlse64.v v8, (a0), zero, v0.t
 ; RV32-NEXT:    addi sp, sp, 16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
index e840036c6a3da8..507f5154cf1aca 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-int.ll
@@ -446,9 +446,10 @@ define <vscale x 64 x i8> @test_vp_reverse_nxv64i8_masked(<vscale x 64 x i8> %sr
 ; CHECK-LABEL: test_vp_reverse_nxv64i8_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    addi a2, a1, -1
-; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    vrsub.vx v24, v16, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v23, v8, v24
@@ -459,7 +460,6 @@ define <vscale x 64 x i8> @test_vp_reverse_nxv64i8_masked(<vscale x 64 x i8> %sr
 ; CHECK-NEXT:    vrgatherei16.vv v18, v13, v24
 ; CHECK-NEXT:    vrgatherei16.vv v17, v14, v24
 ; CHECK-NEXT:    vrgatherei16.vv v16, v15, v24
-; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a1, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v16, a1, v0.t
@@ -472,9 +472,10 @@ define <vscale x 64 x i8> @test_vp_reverse_nxv64i8(<vscale x 64 x i8> %src, i32
 ; CHECK-LABEL: test_vp_reverse_nxv64i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    addi a2, a1, -1
-; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    vrsub.vx v24, v16, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v23, v8, v24
@@ -485,7 +486,6 @@ define <vscale x 64 x i8> @test_vp_reverse_nxv64i8(<vscale x 64 x i8> %src, i32
 ; CHECK-NEXT:    vrgatherei16.vv v18, v13, v24
 ; CHECK-NEXT:    vrgatherei16.vv v17, v14, v24
 ; CHECK-NEXT:    vrgatherei16.vv v16, v15, v24
-; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub a1, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v16, a1
@@ -498,12 +498,12 @@ define <vscale x 64 x i8> @test_vp_reverse_nxv64i8(<vscale x 64 x i8> %src, i32
 define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i32 zeroext %evl) {
 ; CHECK-LABEL: test_vp_reverse_nxv128i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    mv a2, a0
-; CHECK-NEXT:    bltu a0, a1, .LBB32_2
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    bltu a0, a2, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB32_2:
 ; CHECK-NEXT:    addi sp, sp, -80
 ; CHECK-NEXT:    .cfi_def_cfa_offset 80
@@ -518,21 +518,21 @@ define <vscale x 128 x i8> @test_vp_reverse_nxv128i8(<vscale x 128 x i8> %src, i
 ; CHECK-NEXT:    sub sp, sp, a3
 ; CHECK-NEXT:    andi sp, sp, -64
 ; CHECK-NEXT:    addi a3, sp, 64
-; CHECK-NEXT:    add a4, a0, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    li a5, -1
-; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vsse8.v v8, (a4), a5
-; CHECK-NEXT:    sub a4, a4, a2
-; CHECK-NEXT:    sub a6, a0, a1
-; CHECK-NEXT:    sltu a0, a0, a6
+; CHECK-NEXT:    li a4, -1
+; CHECK-NEXT:    sub a5, a0, a2
+; CHECK-NEXT:    add a6, a0, a3
+; CHECK-NEXT:    sltu a0, a0, a5
+; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    and a0, a0, a6
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vsse8.v v8, (a6), a4
+; CHECK-NEXT:    sub a6, a6, a1
+; CHECK-NEXT:    and a0, a0, a5
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vsse8.v v16, (a4), a5
-; CHECK-NEXT:    add a1, a3, a1
-; CHECK-NEXT:    vle8.v v16, (a1)
-; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT:    vsse8.v v16, (a6), a4
+; CHECK-NEXT:    vle8.v v16, (a2)
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a3)
 ; CHECK-NEXT:    addi sp, s0, -80
 ; CHECK-NEXT:    .cfi_def_cfa sp, 80

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll
index 7f81b99eb0338d..09d92c3c039f92 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask-fixed-vectors.ll
@@ -7,11 +7,11 @@ define <2 x i1> @test_vp_reverse_v2i1_masked(<2 x i1> %src, <2 x i1> %mask, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
@@ -27,9 +27,11 @@ define <2 x i1> @test_vp_reverse_v2i1(<2 x i1> %src, i32 zeroext %evl) {
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v10, v9, v8
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
@@ -44,11 +46,11 @@ define <4 x i1> @test_vp_reverse_v4i1_masked(<4 x i1> %src, <4 x i1> %mask, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
@@ -64,9 +66,11 @@ define <4 x i1> @test_vp_reverse_v4i1(<4 x i1> %src, i32 zeroext %evl) {
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v10, v9, v8
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
@@ -81,11 +85,11 @@ define <8 x i1> @test_vp_reverse_v8i1_masked(<8 x i1> %src, <8 x i1> %mask, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
@@ -101,9 +105,11 @@ define <8 x i1> @test_vp_reverse_v8i1(<8 x i1> %src, i32 zeroext %evl) {
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v10, v9, v8
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
@@ -118,11 +124,11 @@ define <16 x i1> @test_vp_reverse_v16i1_masked(<16 x i1> %src, <16 x i1> %mask,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v9, v10, v0.t
@@ -138,9 +144,11 @@ define <16 x i1> @test_vp_reverse_v16i1(<16 x i1> %src, i32 zeroext %evl) {
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v11, v10, v8
 ; CHECK-NEXT:    vmsne.vi v0, v11, 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll
index 8c1be2c1e9791e..8e44d76e7010f6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-reverse-mask.ll
@@ -6,11 +6,11 @@ define <vscale x 1 x i1> @test_vp_reverse_nxv1i1_masked(<vscale x 1 x i1> %src,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
@@ -26,9 +26,11 @@ define <vscale x 1 x i1> @test_vp_reverse_nxv1i1(<vscale x 1 x i1> %src, i32 zer
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v10, v9, v8
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
@@ -43,11 +45,11 @@ define <vscale x 2 x i1> @test_vp_reverse_nxv2i1_masked(<vscale x 2 x i1> %src,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
@@ -63,9 +65,11 @@ define <vscale x 2 x i1> @test_vp_reverse_nxv2i1(<vscale x 2 x i1> %src, i32 zer
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v10, v9, v8
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
@@ -80,11 +84,11 @@ define <vscale x 4 x i1> @test_vp_reverse_nxv4i1_masked(<vscale x 4 x i1> %src,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v11, v9, v10, v0.t
@@ -100,9 +104,11 @@ define <vscale x 4 x i1> @test_vp_reverse_nxv4i1(<vscale x 4 x i1> %src, i32 zer
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v10, v9, v8
 ; CHECK-NEXT:    vmsne.vi v0, v10, 0
@@ -117,11 +123,11 @@ define <vscale x 8 x i1> @test_vp_reverse_nxv8i1_masked(<vscale x 8 x i1> %src,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v10, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v10, v10, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v9, v10, v0.t
@@ -137,9 +143,11 @@ define <vscale x 8 x i1> @test_vp_reverse_nxv8i1(<vscale x 8 x i1> %src, i32 zer
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v11, v10, v8
 ; CHECK-NEXT:    vmsne.vi v0, v11, 0
@@ -154,11 +162,11 @@ define <vscale x 16 x i1> @test_vp_reverse_nxv16i1_masked(<vscale x 16 x i1> %sr
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vid.v v12, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v12, v12, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v16, v10, v12, v0.t
@@ -175,9 +183,11 @@ define <vscale x 16 x i1> @test_vp_reverse_nxv16i1(<vscale x 16 x i1> %src, i32
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v14, v12, v8
 ; CHECK-NEXT:    vmsne.vi v0, v14, 0
@@ -192,11 +202,11 @@ define <vscale x 32 x i1> @test_vp_reverse_nxv32i1_masked(<vscale x 32 x i1> %sr
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vid.v v16, v0.t
-; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    vrsub.vx v16, v16, a0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v24, v12, v16, v0.t
@@ -213,9 +223,11 @@ define <vscale x 32 x i1> @test_vp_reverse_nxv32i1(<vscale x 32 x i1> %src, i32
 ; CHECK-NEXT:    addi a1, a0, -1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v8, v8, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vrsub.vx v8, v8, a1
+; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    vrgatherei16.vv v20, v16, v8
 ; CHECK-NEXT:    vmsne.vi v0, v20, 0
@@ -230,26 +242,28 @@ define <vscale x 64 x i1> @test_vp_reverse_nxv64i1_masked(<vscale x 64 x i1> %sr
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmerge.vim v24, v16, 1, v0
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v10
 ; CHECK-NEXT:    vrsub.vx v10, v10, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v23, v24, v10
-; CHECK-NEXT:    vrgatherei16.vv v22, v25, v10
-; CHECK-NEXT:    vrgatherei16.vv v21, v26, v10
-; CHECK-NEXT:    vrgatherei16.vv v20, v27, v10
-; CHECK-NEXT:    vrgatherei16.vv v19, v28, v10
-; CHECK-NEXT:    vrgatherei16.vv v18, v29, v10
-; CHECK-NEXT:    vrgatherei16.vv v17, v30, v10
-; CHECK-NEXT:    vrgatherei16.vv v16, v31, v10
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    vrgatherei16.vv v31, v16, v10
+; CHECK-NEXT:    vrgatherei16.vv v30, v17, v10
+; CHECK-NEXT:    vrgatherei16.vv v29, v18, v10
+; CHECK-NEXT:    vrgatherei16.vv v28, v19, v10
+; CHECK-NEXT:    vrgatherei16.vv v27, v20, v10
+; CHECK-NEXT:    vrgatherei16.vv v26, v21, v10
+; CHECK-NEXT:    vrgatherei16.vv v25, v22, v10
+; CHECK-NEXT:    vrgatherei16.vv v24, v23, v10
 ; CHECK-NEXT:    sub a1, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vslidedown.vx v16, v24, a1, v0.t
 ; CHECK-NEXT:    vmsne.vi v8, v16, 0, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    ret
@@ -262,25 +276,27 @@ define <vscale x 64 x i1> @test_vp_reverse_nxv64i1(<vscale x 64 x i1> %src, i32
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
 ; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vid.v v16
+; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    addi a2, a1, -1
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vid.v v8
-; CHECK-NEXT:    vrsub.vx v24, v8, a2
+; CHECK-NEXT:    vrsub.vx v24, v16, a2
 ; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v15, v16, v24
-; CHECK-NEXT:    vrgatherei16.vv v14, v17, v24
-; CHECK-NEXT:    vrgatherei16.vv v13, v18, v24
-; CHECK-NEXT:    vrgatherei16.vv v12, v19, v24
-; CHECK-NEXT:    vrgatherei16.vv v11, v20, v24
-; CHECK-NEXT:    vrgatherei16.vv v10, v21, v24
-; CHECK-NEXT:    vrgatherei16.vv v9, v22, v24
-; CHECK-NEXT:    vrgatherei16.vv v8, v23, v24
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    vrgatherei16.vv v23, v8, v24
+; CHECK-NEXT:    vrgatherei16.vv v22, v9, v24
+; CHECK-NEXT:    vrgatherei16.vv v21, v10, v24
+; CHECK-NEXT:    vrgatherei16.vv v20, v11, v24
+; CHECK-NEXT:    vrgatherei16.vv v19, v12, v24
+; CHECK-NEXT:    vrgatherei16.vv v18, v13, v24
+; CHECK-NEXT:    vrgatherei16.vv v17, v14, v24
+; CHECK-NEXT:    vrgatherei16.vv v16, v15, v24
 ; CHECK-NEXT:    sub a1, a1, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a1
+; CHECK-NEXT:    vslidedown.vx v8, v16, a1
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
 ; CHECK-NEXT:    ret
 

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll
index 9496cd82947d4b..2a137099bcb0f4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-fixed-vectors.ll
@@ -11,10 +11,10 @@ define <2 x i1> @test_vp_splice_v2i1(<2 x i1> %va, <2 x i1> %vb, i32 zeroext %ev
 ; CHECK-LABEL: test_vp_splice_v2i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -35,10 +35,10 @@ define <2 x i1> @test_vp_splice_v2i1_negative_offset(<2 x i1> %va, <2 x i1> %vb,
 ; CHECK-LABEL: test_vp_splice_v2i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -59,10 +59,10 @@ define <2 x i1> @test_vp_splice_v2i1_masked(<2 x i1> %va, <2 x i1> %vb, <2 x i1>
 ; CHECK-LABEL: test_vp_splice_v2i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -84,10 +84,10 @@ define <4 x i1> @test_vp_splice_v4i1(<4 x i1> %va, <4 x i1> %vb, i32 zeroext %ev
 ; CHECK-LABEL: test_vp_splice_v4i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -108,10 +108,10 @@ define <4 x i1> @test_vp_splice_v4i1_negative_offset(<4 x i1> %va, <4 x i1> %vb,
 ; CHECK-LABEL: test_vp_splice_v4i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -132,10 +132,10 @@ define <4 x i1> @test_vp_splice_v4i1_masked(<4 x i1> %va, <4 x i1> %vb, <4 x i1>
 ; CHECK-LABEL: test_vp_splice_v4i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -157,10 +157,10 @@ define <8 x i1> @test_vp_splice_v8i1(<8 x i1> %va, <8 x i1> %vb, i32 zeroext %ev
 ; CHECK-LABEL: test_vp_splice_v8i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -181,10 +181,10 @@ define <8 x i1> @test_vp_splice_v8i1_negative_offset(<8 x i1> %va, <8 x i1> %vb,
 ; CHECK-LABEL: test_vp_splice_v8i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -205,10 +205,10 @@ define <8 x i1> @test_vp_splice_v8i1_masked(<8 x i1> %va, <8 x i1> %vb, <8 x i1>
 ; CHECK-LABEL: test_vp_splice_v8i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -230,10 +230,10 @@ define <16 x i1> @test_vp_splice_v16i1(<16 x i1> %va, <16 x i1> %vb, i32 zeroext
 ; CHECK-LABEL: test_vp_splice_v16i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -254,10 +254,10 @@ define <16 x i1> @test_vp_splice_v16i1_negative_offset(<16 x i1> %va, <16 x i1>
 ; CHECK-LABEL: test_vp_splice_v16i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -278,10 +278,10 @@ define <16 x i1> @test_vp_splice_v16i1_masked(<16 x i1> %va, <16 x i1> %vb, <16
 ; CHECK-LABEL: test_vp_splice_v16i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll
index 90276308252271..fc446d0a3a88ac 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-splice-mask-vectors.ll
@@ -14,10 +14,10 @@ define <vscale x 1 x i1> @test_vp_splice_nxv1i1(<vscale x 1 x i1> %va, <vscale x
 ; CHECK-LABEL: test_vp_splice_nxv1i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -38,10 +38,10 @@ define <vscale x 1 x i1> @test_vp_splice_nxv1i1_negative_offset(<vscale x 1 x i1
 ; CHECK-LABEL: test_vp_splice_nxv1i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -62,10 +62,10 @@ define <vscale x 1 x i1> @test_vp_splice_nxv1i1_masked(<vscale x 1 x i1> %va, <v
 ; CHECK-LABEL: test_vp_splice_nxv1i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -87,10 +87,10 @@ define <vscale x 2 x i1> @test_vp_splice_nxv2i1(<vscale x 2 x i1> %va, <vscale x
 ; CHECK-LABEL: test_vp_splice_nxv2i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -111,10 +111,10 @@ define <vscale x 2 x i1> @test_vp_splice_nxv2i1_negative_offset(<vscale x 2 x i1
 ; CHECK-LABEL: test_vp_splice_nxv2i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -135,10 +135,10 @@ define <vscale x 2 x i1> @test_vp_splice_nxv2i1_masked(<vscale x 2 x i1> %va, <v
 ; CHECK-LABEL: test_vp_splice_nxv2i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -160,10 +160,10 @@ define <vscale x 4 x i1> @test_vp_splice_nxv4i1(<vscale x 4 x i1> %va, <vscale x
 ; CHECK-LABEL: test_vp_splice_nxv4i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -184,10 +184,10 @@ define <vscale x 4 x i1> @test_vp_splice_nxv4i1_negative_offset(<vscale x 4 x i1
 ; CHECK-LABEL: test_vp_splice_nxv4i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -208,10 +208,10 @@ define <vscale x 4 x i1> @test_vp_splice_nxv4i1_masked(<vscale x 4 x i1> %va, <v
 ; CHECK-LABEL: test_vp_splice_nxv4i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -233,10 +233,10 @@ define <vscale x 8 x i1> @test_vp_splice_nxv8i1(<vscale x 8 x i1> %va, <vscale x
 ; CHECK-LABEL: test_vp_splice_nxv8i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -257,10 +257,10 @@ define <vscale x 8 x i1> @test_vp_splice_nxv8i1_negative_offset(<vscale x 8 x i1
 ; CHECK-LABEL: test_vp_splice_nxv8i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    vmv1r.v v0, v9
@@ -281,10 +281,10 @@ define <vscale x 8 x i1> @test_vp_splice_nxv8i1_masked(<vscale x 8 x i1> %va, <v
 ; CHECK-LABEL: test_vp_splice_nxv8i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v11, 1, v0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v11, 0
 ; CHECK-NEXT:    vmv1r.v v0, v10
@@ -306,9 +306,9 @@ define <vscale x 16 x i1> @test_vp_splice_nxv16i1(<vscale x 16 x i1> %va, <vscal
 ; CHECK-LABEL: test_vp_splice_nxv16i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
@@ -330,9 +330,9 @@ define <vscale x 16 x i1> @test_vp_splice_nxv16i1_negative_offset(<vscale x 16 x
 ; CHECK-LABEL: test_vp_splice_nxv16i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v10, v10, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
@@ -354,9 +354,9 @@ define <vscale x 16 x i1> @test_vp_splice_nxv16i1_masked(<vscale x 16 x i1> %va,
 ; CHECK-LABEL: test_vp_splice_nxv16i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.i v14, 0
@@ -380,9 +380,9 @@ define <vscale x 32 x i1> @test_vp_splice_nxv32i1(<vscale x 32 x i1> %va, <vscal
 ; CHECK-LABEL: test_vp_splice_nxv32i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
@@ -404,9 +404,9 @@ define <vscale x 32 x i1> @test_vp_splice_nxv32i1_negative_offset(<vscale x 32 x
 ; CHECK-LABEL: test_vp_splice_nxv32i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
@@ -428,9 +428,9 @@ define <vscale x 32 x i1> @test_vp_splice_nxv32i1_masked(<vscale x 32 x i1> %va,
 ; CHECK-LABEL: test_vp_splice_nxv32i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v12, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v12, v12, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
@@ -454,9 +454,9 @@ define <vscale x 64 x i1> @test_vp_splice_nxv64i1(<vscale x 64 x i1> %va, <vscal
 ; CHECK-LABEL: test_vp_splice_nxv64i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
@@ -478,9 +478,9 @@ define <vscale x 64 x i1> @test_vp_splice_nxv64i1_negative_offset(<vscale x 64 x
 ; CHECK-LABEL: test_vp_splice_nxv64i1_negative_offset:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
@@ -502,9 +502,9 @@ define <vscale x 64 x i1> @test_vp_splice_nxv64i1_masked(<vscale x 64 x i1> %va,
 ; CHECK-LABEL: test_vp_splice_nxv64i1_masked:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmv1r.v v10, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
index 34f8f35ee98c0c..3e423c8ec99030 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpgather-sdnode.ll
@@ -261,16 +261,16 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a2, a3, 1
-; RV32-NEXT:    sub a4, a1, a2
-; RV32-NEXT:    sltu a5, a1, a4
-; RV32-NEXT:    addi a5, a5, -1
-; RV32-NEXT:    and a4, a5, a4
 ; RV32-NEXT:    srli a3, a3, 2
+; RV32-NEXT:    sub a4, a1, a2
 ; RV32-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a3
-; RV32-NEXT:    vsetvli a3, zero, e32, m8, ta, ma
+; RV32-NEXT:    sltu a3, a1, a4
+; RV32-NEXT:    addi a3, a3, -1
+; RV32-NEXT:    and a3, a3, a4
+; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vsext.vf4 v16, v10
-; RV32-NEXT:    vsetvli zero, a4, e8, m2, ta, ma
+; RV32-NEXT:    vsetvli zero, a3, e8, m2, ta, ma
 ; RV32-NEXT:    vluxei32.v v10, (a0), v16, v0.t
 ; RV32-NEXT:    bltu a1, a2, .LBB12_2
 ; RV32-NEXT:  # %bb.1:
@@ -288,17 +288,17 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    csrr a2, vlenb
 ; RV64-NEXT:    slli a3, a2, 1
-; RV64-NEXT:    sub a4, a1, a3
-; RV64-NEXT:    sltu a5, a1, a4
-; RV64-NEXT:    addi a5, a5, -1
-; RV64-NEXT:    and a5, a5, a4
+; RV64-NEXT:    srli a4, a2, 2
+; RV64-NEXT:    sub a5, a1, a3
+; RV64-NEXT:    vsetvli a6, zero, e8, mf2, ta, ma
+; RV64-NEXT:    vslidedown.vx v13, v0, a4
+; RV64-NEXT:    sltu a4, a1, a5
+; RV64-NEXT:    addi a4, a4, -1
+; RV64-NEXT:    and a5, a4, a5
 ; RV64-NEXT:    sub a4, a5, a2
 ; RV64-NEXT:    sltu a6, a5, a4
 ; RV64-NEXT:    addi a6, a6, -1
 ; RV64-NEXT:    and a6, a6, a4
-; RV64-NEXT:    srli a4, a2, 2
-; RV64-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
-; RV64-NEXT:    vslidedown.vx v13, v0, a4
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v13, a4
@@ -320,11 +320,11 @@ define <vscale x 32 x i8> @vpgather_baseidx_nxv32i8(ptr %base, <vscale x 32 x i8
 ; RV64-NEXT:    mv a1, a3
 ; RV64-NEXT:  .LBB12_4:
 ; RV64-NEXT:    sub a3, a1, a2
-; RV64-NEXT:    sltu a5, a1, a3
-; RV64-NEXT:    addi a5, a5, -1
-; RV64-NEXT:    and a3, a5, a3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v12, a4
+; RV64-NEXT:    sltu a4, a1, a3
+; RV64-NEXT:    addi a4, a4, -1
+; RV64-NEXT:    and a3, a4, a3
 ; RV64-NEXT:    vsetvli a4, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf8 v16, v9
 ; RV64-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
@@ -2460,11 +2460,11 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV32-NEXT:    vmv1r.v v24, v0
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    sub a2, a0, a1
+; RV32-NEXT:    srli a3, a1, 3
+; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vslidedown.vx v0, v0, a3
 ; RV32-NEXT:    sltu a3, a0, a2
 ; RV32-NEXT:    addi a3, a3, -1
-; RV32-NEXT:    srli a4, a1, 3
-; RV32-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v0, v0, a4
 ; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vluxei32.v v16, (zero), v12, v0.t
@@ -2483,11 +2483,11 @@ define <vscale x 16 x double> @vpgather_nxv16f64(<vscale x 16 x ptr> %ptrs, <vsc
 ; RV64-NEXT:    vmv1r.v v24, v0
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    sub a2, a0, a1
+; RV64-NEXT:    srli a3, a1, 3
+; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vslidedown.vx v0, v0, a3
 ; RV64-NEXT:    sltu a3, a0, a2
 ; RV64-NEXT:    addi a3, a3, -1
-; RV64-NEXT:    srli a4, a1, 3
-; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v0, v0, a4
 ; RV64-NEXT:    and a2, a3, a2
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vluxei64.v v16, (zero), v16, v0.t
@@ -2509,8 +2509,8 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vsext.vf2 v16, v8
-; RV32-NEXT:    vsll.vi v24, v16, 3
 ; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vsll.vi v24, v16, 3
 ; RV32-NEXT:    sub a3, a1, a2
 ; RV32-NEXT:    srli a4, a2, 3
 ; RV32-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
@@ -2534,8 +2534,8 @@ define <vscale x 16 x double> @vpgather_baseidx_nxv16i16_nxv16f64(ptr %base, <vs
 ; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v10
-; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    sub a3, a1, a2
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
@@ -2567,8 +2567,8 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vsext.vf2 v16, v8
-; RV32-NEXT:    vsll.vi v24, v16, 3
 ; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vsll.vi v24, v16, 3
 ; RV32-NEXT:    sub a3, a1, a2
 ; RV32-NEXT:    srli a4, a2, 3
 ; RV32-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
@@ -2592,8 +2592,8 @@ define <vscale x 16 x double> @vpgather_baseidx_sext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v10
-; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vsll.vi v16, v16, 3
 ; RV64-NEXT:    sub a3, a1, a2
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
@@ -2626,8 +2626,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV32-NEXT:    vmv1r.v v12, v0
 ; RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vzext.vf2 v16, v8
-; RV32-NEXT:    vsll.vi v24, v16, 3
 ; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vsll.vi v24, v16, 3
 ; RV32-NEXT:    sub a3, a1, a2
 ; RV32-NEXT:    srli a4, a2, 3
 ; RV32-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma
@@ -2651,8 +2651,8 @@ define <vscale x 16 x double> @vpgather_baseidx_zext_nxv16i16_nxv16f64(ptr %base
 ; RV64-NEXT:    vmv1r.v v12, v0
 ; RV64-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; RV64-NEXT:    vzext.vf2 v16, v8
-; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    sub a3, a1, a2
 ; RV64-NEXT:    srli a4, a2, 3
 ; RV64-NEXT:    vsetvli a5, zero, e8, mf4, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vpload.ll b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
index 5683a7b7588546..8dfab72d008c2c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpload.ll
@@ -513,13 +513,13 @@ define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m,
 ; CHECK-NEXT:    vmv1r.v v8, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a3, a4, a3
 ; CHECK-NEXT:    slli a4, a2, 3
 ; CHECK-NEXT:    srli a5, a2, 3
 ; CHECK-NEXT:    vsetvli a6, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a5
+; CHECK-NEXT:    sltu a5, a1, a3
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    and a3, a5, a3
 ; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a4), v0.t
@@ -549,26 +549,26 @@ declare <vscale x 16 x double> @llvm.vector.extract.nxv16f64(<vscale x 17 x doub
 define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpload_nxv17f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmv1r.v v8, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a5, a3, 1
-; CHECK-NEXT:    vmv1r.v v8, v0
 ; CHECK-NEXT:    mv a4, a2
 ; CHECK-NEXT:    bltu a2, a5, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a5
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    sub a6, a4, a3
-; CHECK-NEXT:    sltu a7, a4, a6
-; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    and a6, a7, a6
 ; CHECK-NEXT:    slli a7, a3, 3
 ; CHECK-NEXT:    srli t0, a3, 3
+; CHECK-NEXT:    sub a5, a2, a5
 ; CHECK-NEXT:    vsetvli t1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, t0
+; CHECK-NEXT:    sltu t0, a4, a6
 ; CHECK-NEXT:    add a7, a0, a7
+; CHECK-NEXT:    addi t0, t0, -1
+; CHECK-NEXT:    and a6, t0, a6
 ; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a7), v0.t
-; CHECK-NEXT:    sub a5, a2, a5
 ; CHECK-NEXT:    sltu a2, a2, a5
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a5

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
index 3e3276788cd539..f029d0b1b01bc0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll
@@ -366,16 +366,16 @@ define <vscale x 128 x i8> @vpmerge_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a4, a0, a1
-; CHECK-NEXT:    vl8r.v v16, (a4)
 ; CHECK-NEXT:    vsetvli a4, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a2)
-; CHECK-NEXT:    sub a2, a3, a1
-; CHECK-NEXT:    sltu a4, a3, a2
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    sub a4, a3, a1
+; CHECK-NEXT:    vl8r.v v16, (a2)
+; CHECK-NEXT:    sltu a2, a3, a4
 ; CHECK-NEXT:    vl8r.v v8, (a0)
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    and a2, a4, a2
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a4
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, tu, ma
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    bltu a3, a1, .LBB28_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
index 35cffc0eb3718f..6f74d8cd7e01a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpscatter-sdnode.ll
@@ -2305,9 +2305,9 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (zero), v24, v0.t
 ; RV32-NEXT:    sub a2, a1, a0
+; RV32-NEXT:    srli a0, a0, 3
 ; RV32-NEXT:    sltu a1, a1, a2
 ; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    srli a0, a0, 3
 ; RV32-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a0
 ; RV32-NEXT:    and a1, a1, a2
@@ -2323,12 +2323,12 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 ; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    sub sp, sp, a1
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    slli a3, a1, 3
 ; RV64-NEXT:    add a3, a0, a3
-; RV64-NEXT:    vl8re64.v v24, (a3)
-; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vl8re64.v v16, (a3)
 ; RV64-NEXT:    vl8re64.v v24, (a0)
 ; RV64-NEXT:    mv a0, a2
 ; RV64-NEXT:    bltu a2, a1, .LBB108_2
@@ -2338,16 +2338,16 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (zero), v24, v0.t
 ; RV64-NEXT:    sub a0, a2, a1
+; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    sltu a2, a2, a0
 ; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    and a0, a2, a0
 ; RV64-NEXT:    addi a1, sp, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v16, (zero), v8, v0.t
+; RV64-NEXT:    vsoxei64.v v8, (zero), v16, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add sp, sp, a0
@@ -2362,27 +2362,41 @@ define void @vpscatter_nxv16f64(<vscale x 16 x double> %val, <vscale x 16 x ptr>
 define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vl4re16.v v4, (a1)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    sub sp, sp, a3
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl4re16.v v24, (a1)
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsext.vf2 v24, v4
+; RV32-NEXT:    vsext.vf2 v0, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    vsll.vi v24, v24, 3
+; RV32-NEXT:    vsll.vi v24, v0, 3
 ; RV32-NEXT:    mv a3, a2
 ; RV32-NEXT:    bltu a2, a1, .LBB109_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a3, a1
 ; RV32-NEXT:  .LBB109_2:
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vl1r.v v0, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a2, a1
+; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
 ; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpscatter_baseidx_nxv16i16_nxv16f64:
@@ -2393,19 +2407,19 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
 ; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vl4re16.v v24, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v26
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsext.vf4 v16, v24
-; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    mv a3, a2
 ; RV64-NEXT:    bltu a2, a1, .LBB109_2
@@ -2415,9 +2429,9 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a2, a1
+; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    and a2, a2, a3
@@ -2445,27 +2459,41 @@ define void @vpscatter_baseidx_nxv16i16_nxv16f64(<vscale x 16 x double> %val, pt
 define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %val, ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vl4re16.v v4, (a1)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    sub sp, sp, a3
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl4re16.v v24, (a1)
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsext.vf2 v24, v4
+; RV32-NEXT:    vsext.vf2 v0, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    vsll.vi v24, v24, 3
+; RV32-NEXT:    vsll.vi v24, v0, 3
 ; RV32-NEXT:    mv a3, a2
 ; RV32-NEXT:    bltu a2, a1, .LBB110_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a3, a1
 ; RV32-NEXT:  .LBB110_2:
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vl1r.v v0, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a2, a1
+; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
 ; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpscatter_baseidx_sext_nxv16i16_nxv16f64:
@@ -2473,48 +2501,48 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    slli a4, a3, 3
-; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    slli a3, a3, 4
 ; RV64-NEXT:    sub sp, sp, a3
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    slli a3, a3, 3
+; RV64-NEXT:    add a3, sp, a3
+; RV64-NEXT:    addi a3, a3, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    vl4re16.v v24, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsext.vf4 v0, v24
+; RV64-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vsext.vf4 v16, v26
 ; RV64-NEXT:    vsll.vi v16, v16, 3
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    vsll.vi v24, v0, 3
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vsext.vf4 v16, v24
+; RV64-NEXT:    vsll.vi v24, v16, 3
 ; RV64-NEXT:    mv a3, a2
 ; RV64-NEXT:    bltu a2, a1, .LBB110_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a3, a1
 ; RV64-NEXT:  .LBB110_2:
-; RV64-NEXT:    addi a4, sp, 16
-; RV64-NEXT:    vl1r.v v0, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a2, a1
+; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    and a2, a2, a3
 ; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    addi a1, sp, 16
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei64.v v8, (a0), v16, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 3
-; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    slli a0, a0, 4
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -2529,52 +2557,80 @@ define void @vpscatter_baseidx_sext_nxv16i16_nxv16f64(<vscale x 16 x double> %va
 define void @vpscatter_baseidx_zext_nxv16i16_nxv16f64(<vscale x 16 x double> %val, ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; RV32-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vl4re16.v v4, (a1)
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    sub sp, sp, a3
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; RV32-NEXT:    addi a3, sp, 16
+; RV32-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl4re16.v v24, (a1)
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vzext.vf2 v24, v4
+; RV32-NEXT:    vzext.vf2 v0, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    vsll.vi v24, v24, 3
+; RV32-NEXT:    vsll.vi v24, v0, 3
 ; RV32-NEXT:    mv a3, a2
 ; RV32-NEXT:    bltu a2, a1, .LBB111_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a3, a1
 ; RV32-NEXT:  .LBB111_2:
+; RV32-NEXT:    addi a4, sp, 16
+; RV32-NEXT:    vl1r.v v0, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    sub a3, a2, a1
+; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    sltu a2, a2, a3
 ; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV32-NEXT:    vslidedown.vx v0, v0, a1
 ; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vpscatter_baseidx_zext_nxv16i16_nxv16f64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vl4re16.v v4, (a1)
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    csrr a3, vlenb
+; RV64-NEXT:    sub sp, sp, a3
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; RV64-NEXT:    addi a3, sp, 16
+; RV64-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vl4re16.v v24, (a1)
 ; RV64-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV64-NEXT:    vzext.vf2 v24, v4
+; RV64-NEXT:    vzext.vf2 v0, v24
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    vsll.vi v24, v24, 3
+; RV64-NEXT:    vsll.vi v24, v0, 3
 ; RV64-NEXT:    mv a3, a2
 ; RV64-NEXT:    bltu a2, a1, .LBB111_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a3, a1
 ; RV64-NEXT:  .LBB111_2:
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vl1r.v v0, (a4) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV64-NEXT:    sub a3, a2, a1
+; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    sltu a2, a2, a3
 ; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; RV64-NEXT:    vslidedown.vx v0, v0, a1
 ; RV64-NEXT:    and a2, a2, a3
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV64-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
   %eidxs = zext <vscale x 16 x i16> %idxs to <vscale x 16 x i64>
   %ptrs = getelementptr inbounds double, ptr %base, <vscale x 16 x i64> %eidxs

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index 6d8574c11d1dd6..549f57a01f38f9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -427,15 +427,15 @@ define void @vpstore_nxv16f64(<vscale x 16 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a0), v0.t
+; CHECK-NEXT:    srli a3, a2, 3
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    sub a3, a1, a2
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sltu a1, a1, a3
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a3
-; CHECK-NEXT:    slli a3, a2, 3
-; CHECK-NEXT:    srli a2, a2, 3
-; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    add a0, a0, a3
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v16, (a0), v0.t
 ; CHECK-NEXT:    ret
@@ -450,9 +450,17 @@ declare void @llvm.vp.store.nxv17f64.p0(<vscale x 17 x double>, ptr, <vscale x 1
 define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 17 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpstore_nxv17f64:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a4, a3, 1
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    sub sp, sp, a3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    slli a4, a3, 1
 ; CHECK-NEXT:    mv a5, a2
 ; CHECK-NEXT:    bltu a2, a4, .LBB35_2
 ; CHECK-NEXT:  # %bb.1:
@@ -463,33 +471,27 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a6, a3
 ; CHECK-NEXT:  .LBB35_4:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a7, vlenb
-; CHECK-NEXT:    slli a7, a7, 3
-; CHECK-NEXT:    sub sp, sp, a7
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vl8re64.v v16, (a0)
 ; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a1), v0.t
 ; CHECK-NEXT:    sub a0, a5, a3
+; CHECK-NEXT:    srli a6, a3, 3
+; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v24, a6
+; CHECK-NEXT:    slli a6, a3, 3
+; CHECK-NEXT:    sub a4, a2, a4
 ; CHECK-NEXT:    sltu a5, a5, a0
+; CHECK-NEXT:    add a6, a1, a6
+; CHECK-NEXT:    sltu a2, a2, a4
 ; CHECK-NEXT:    addi a5, a5, -1
-; CHECK-NEXT:    and a5, a5, a0
-; CHECK-NEXT:    slli a0, a3, 3
-; CHECK-NEXT:    add a6, a1, a0
-; CHECK-NEXT:    srli a0, a3, 3
-; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a0
-; CHECK-NEXT:    sub a0, a2, a4
-; CHECK-NEXT:    sltu a2, a2, a0
 ; CHECK-NEXT:    addi a2, a2, -1
-; CHECK-NEXT:    and a0, a2, a0
+; CHECK-NEXT:    and a5, a5, a0
+; CHECK-NEXT:    and a0, a2, a4
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v16, (a6), v0.t
+; CHECK-NEXT:    vse64.v v8, (a6), v0.t
 ; CHECK-NEXT:    bltu a0, a3, .LBB35_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a0, a3
@@ -499,10 +501,8 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v24, a3
 ; CHECK-NEXT:    add a1, a1, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v8, (a1), v0.t
+; CHECK-NEXT:    vse64.v v16, (a1), v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index c2ad7e76a26c75..a78130e8f102fc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -925,8 +925,8 @@ define half @vreduce_ord_fadd_nxv10f16(<vscale x 10 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv10f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    li a1, 10
+; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v12, fa0
@@ -963,11 +963,11 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv3f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    lui a1, 1048568
 ; CHECK-NEXT:    vmv.s.x v10, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -982,11 +982,11 @@ define half @vreduce_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_fadd_nxv6f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vfmv.s.f v10, fa0
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a0, a0, a1
-; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vfmv.s.f v10, fa0
 ; CHECK-NEXT:    lui a1, 1048568
 ; CHECK-NEXT:    vmv.s.x v11, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
index 13d1ac5088479f..a20f88c6977165 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll
@@ -95,9 +95,9 @@ define half @vpreduce_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscale x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a1, a2, 1
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v0, a1
-; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    sub a1, a0, a2
 ; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -124,9 +124,9 @@ define half @vpreduce_ord_fadd_nxv64f16(half %s, <vscale x 64 x half> %v, <vscal
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    srli a1, a2, 1
+; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v0, a1
-; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    sub a1, a0, a2
 ; CHECK-NEXT:    sltu a3, a0, a1
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -349,11 +349,11 @@ define float @vreduce_fminimum_nxv4f32(float %start, <vscale x 4 x float> %val,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v10, fa0
+; CHECK-NEXT:    feq.s a1, fa0, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfredmin.vs v10, v8, v10, v0.t
 ; CHECK-NEXT:    vmfne.vv v11, v8, v8, v0.t
 ; CHECK-NEXT:    vcpop.m a0, v11, v0.t
-; CHECK-NEXT:    feq.s a1, fa0, fa0
 ; CHECK-NEXT:    xori a1, a1, 1
 ; CHECK-NEXT:    or a0, a0, a1
 ; CHECK-NEXT:    beqz a0, .LBB22_2
@@ -373,11 +373,11 @@ define float @vreduce_fmaximum_nxv4f32(float %start, <vscale x 4 x float> %val,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v10, fa0
+; CHECK-NEXT:    feq.s a1, fa0, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
 ; CHECK-NEXT:    vfredmax.vs v10, v8, v10, v0.t
 ; CHECK-NEXT:    vmfne.vv v11, v8, v8, v0.t
 ; CHECK-NEXT:    vcpop.m a0, v11, v0.t
-; CHECK-NEXT:    feq.s a1, fa0, fa0
 ; CHECK-NEXT:    xori a1, a1, 1
 ; CHECK-NEXT:    or a0, a0, a1
 ; CHECK-NEXT:    beqz a0, .LBB23_2
@@ -423,11 +423,11 @@ define float @vreduce_fminimum_v4f32(float %start, <4 x float> %val, <4 x i1> %m
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
+; CHECK-NEXT:    feq.s a1, fa0, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfredmin.vs v9, v8, v9, v0.t
 ; CHECK-NEXT:    vmfne.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vcpop.m a0, v8, v0.t
-; CHECK-NEXT:    feq.s a1, fa0, fa0
 ; CHECK-NEXT:    xori a1, a1, 1
 ; CHECK-NEXT:    or a0, a0, a1
 ; CHECK-NEXT:    beqz a0, .LBB26_2
@@ -447,11 +447,11 @@ define float @vreduce_fmaximum_v4f32(float %start, <4 x float> %val, <4 x i1> %m
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
+; CHECK-NEXT:    feq.s a1, fa0, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
 ; CHECK-NEXT:    vfredmax.vs v9, v8, v9, v0.t
 ; CHECK-NEXT:    vmfne.vv v8, v8, v8, v0.t
 ; CHECK-NEXT:    vcpop.m a0, v8, v0.t
-; CHECK-NEXT:    feq.s a1, fa0, fa0
 ; CHECK-NEXT:    xori a1, a1, 1
 ; CHECK-NEXT:    or a0, a0, a1
 ; CHECK-NEXT:    beqz a0, .LBB27_2

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
index 6222d7f4ae77f9..eacfce098bddb7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll
@@ -1100,9 +1100,9 @@ define signext i32 @vpreduce_umax_nxv32i32(i32 signext %s, <vscale x 32 x i32> %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    srli a2, a3, 2
+; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v24, v0, a2
-; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    sub a2, a1, a3
 ; CHECK-NEXT:    sltu a4, a1, a2
 ; CHECK-NEXT:    addi a4, a4, -1
@@ -1226,10 +1226,10 @@ define signext i64 @vpreduce_add_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredsum.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1259,11 +1259,11 @@ define signext i64 @vpwreduce_add_nxv1i32(i64 signext %s, <vscale x 1 x i32> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
 ; RV32-NEXT:    vwredsum.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1294,11 +1294,11 @@ define signext i64 @vpwreduce_uadd_nxv1i32(i64 signext %s, <vscale x 1 x i32> %v
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, mf2, ta, ma
 ; RV32-NEXT:    vwredsum.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1331,10 +1331,10 @@ define signext i64 @vpreduce_umax_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1366,10 +1366,10 @@ define signext i64 @vpreduce_smax_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredmax.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1401,10 +1401,10 @@ define signext i64 @vpreduce_umin_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredminu.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1436,10 +1436,10 @@ define signext i64 @vpreduce_smin_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredmin.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1471,10 +1471,10 @@ define signext i64 @vpreduce_and_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredand.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1506,10 +1506,10 @@ define signext i64 @vpreduce_or_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v, <
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredor.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1541,10 +1541,10 @@ define signext i64 @vpreduce_xor_nxv1i64(i64 signext %s, <vscale x 1 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
 ; RV32-NEXT:    vredxor.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1576,10 +1576,10 @@ define signext i64 @vpreduce_add_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredsum.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1609,11 +1609,11 @@ define signext i64 @vwpreduce_add_nxv2i32(i64 signext %s, <vscale x 2 x i32> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
 ; RV32-NEXT:    vwredsum.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1644,11 +1644,11 @@ define signext i64 @vwpreduce_uadd_nxv2i32(i64 signext %s, <vscale x 2 x i32> %v
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
 ; RV32-NEXT:    vwredsum.vs v9, v8, v9, v0.t
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v9
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v9, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1681,10 +1681,10 @@ define signext i64 @vpreduce_umax_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1716,10 +1716,10 @@ define signext i64 @vpreduce_smax_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredmax.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1751,10 +1751,10 @@ define signext i64 @vpreduce_umin_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredminu.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1786,10 +1786,10 @@ define signext i64 @vpreduce_smin_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredmin.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1821,10 +1821,10 @@ define signext i64 @vpreduce_and_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredand.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1856,10 +1856,10 @@ define signext i64 @vpreduce_or_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v, <
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredor.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1891,10 +1891,10 @@ define signext i64 @vpreduce_xor_nxv2i64(i64 signext %s, <vscale x 2 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
 ; RV32-NEXT:    vredxor.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1926,10 +1926,10 @@ define signext i64 @vpreduce_add_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vredsum.vs v12, v8, v12, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v12, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1959,11 +1959,11 @@ define signext i64 @vpwreduce_add_nxv4i32(i64 signext %s, <vscale x 4 x i32> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
 ; RV32-NEXT:    vwredsum.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1994,11 +1994,11 @@ define signext i64 @vpwreduce_uadd_nxv4i32(i64 signext %s, <vscale x 4 x i32> %v
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v10, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e32, m2, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v10, v8, v10, v0.t
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v10
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v10, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2031,10 +2031,10 @@ define signext i64 @vpreduce_umax_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v12, v8, v12, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v12, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2066,10 +2066,10 @@ define signext i64 @vpreduce_smax_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vredmax.vs v12, v8, v12, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v12, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2101,10 +2101,10 @@ define signext i64 @vpreduce_umin_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vredminu.vs v12, v8, v12, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v12, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2136,10 +2136,10 @@ define signext i64 @vpreduce_smin_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vredmin.vs v12, v8, v12, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v12, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2171,10 +2171,10 @@ define signext i64 @vpreduce_and_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vredand.vs v12, v8, v12, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v12, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2206,10 +2206,10 @@ define signext i64 @vpreduce_or_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v, <
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vredor.vs v12, v8, v12, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v12, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -2241,10 +2241,10 @@ define signext i64 @vpreduce_xor_nxv4i64(i64 signext %s, <vscale x 4 x i64> %v,
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vlse64.v v12, (a0), zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
 ; RV32-NEXT:    vredxor.vs v12, v8, v12, v0.t
 ; RV32-NEXT:    vmv.x.s a0, v12
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v12, a1
 ; RV32-NEXT:    vmv.x.s a1, v8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
index bcab7d05e698ee..fac5e31ecf94e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int.ll
@@ -1145,9 +1145,9 @@ define i64 @vreduce_add_nxv1i64(<vscale x 1 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1169,11 +1169,11 @@ define i64 @vwreduce_add_nxv1i32(<vscale x 1 x i32> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1197,11 +1197,11 @@ define i64 @vwreduce_uadd_nxv1i32(<vscale x 1 x i32> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1371,9 +1371,9 @@ define i64 @vreduce_xor_nxv1i64(<vscale x 1 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v9
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1397,9 +1397,9 @@ define i64 @vreduce_add_nxv2i64(<vscale x 2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1421,11 +1421,11 @@ define i64 @vwreduce_add_nxv2i32(<vscale x 2 x i32> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1449,11 +1449,11 @@ define i64 @vwreduce_uadd_nxv2i32(<vscale x 2 x i32> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v9, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v9
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1479,8 +1479,8 @@ define i64 @vreduce_umax_nxv2i64(<vscale x 2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1503,8 +1503,8 @@ define i64 @vreduce_smax_nxv2i64(<vscale x 2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1527,8 +1527,8 @@ define i64 @vreduce_umin_nxv2i64(<vscale x 2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1551,8 +1551,8 @@ define i64 @vreduce_smin_nxv2i64(<vscale x 2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1575,8 +1575,8 @@ define i64 @vreduce_and_nxv2i64(<vscale x 2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vredand.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1599,8 +1599,8 @@ define i64 @vreduce_or_nxv2i64(<vscale x 2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vredor.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1623,9 +1623,9 @@ define i64 @vreduce_xor_nxv2i64(<vscale x 2 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v10
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1649,9 +1649,9 @@ define i64 @vreduce_add_nxv4i64(<vscale x 4 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredsum.vs v8, v8, v12
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1673,11 +1673,11 @@ define i64 @vwreduce_add_nxv4i32(<vscale x 4 x i32> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vwredsum.vs v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1701,11 +1701,11 @@ define i64 @vwreduce_uadd_nxv4i32(<vscale x 4 x i32> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.s.x v10, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vwredsumu.vs v8, v8, v10
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    ret
@@ -1731,8 +1731,8 @@ define i64 @vreduce_umax_nxv4i64(<vscale x 4 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1755,8 +1755,8 @@ define i64 @vreduce_smax_nxv4i64(<vscale x 4 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vredmax.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1779,8 +1779,8 @@ define i64 @vreduce_umin_nxv4i64(<vscale x 4 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vredminu.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1803,8 +1803,8 @@ define i64 @vreduce_smin_nxv4i64(<vscale x 4 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vredmin.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1827,8 +1827,8 @@ define i64 @vreduce_and_nxv4i64(<vscale x 4 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vredand.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1851,8 +1851,8 @@ define i64 @vreduce_or_nxv4i64(<vscale x 4 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vredor.vs v8, v8, v8
-; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    li a1, 32
+; RV32-NEXT:    vmv.x.s a0, v8
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8
@@ -1875,9 +1875,9 @@ define i64 @vreduce_xor_nxv4i64(<vscale x 4 x i64> %v) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
 ; RV32-NEXT:    vmv.s.x v12, zero
+; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vredxor.vs v8, v8, v12
 ; RV32-NEXT:    vmv.x.s a0, v8
-; RV32-NEXT:    li a1, 32
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vsrl.vx v8, v8, a1
 ; RV32-NEXT:    vmv.x.s a1, v8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll
index 509bad44bb3218..9c21a626478e30 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrem-sdnode.ll
@@ -966,16 +966,16 @@ define <vscale x 1 x i64> @vrem_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
 ; RV32-V-NEXT:    addi sp, sp, -16
 ; RV32-V-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V-NEXT:    lui a0, 748983
-; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    lui a1, 898779
+; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    addi a1, a1, 1755
 ; RV32-V-NEXT:    sw a1, 8(sp)
 ; RV32-V-NEXT:    sw a0, 12(sp)
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-V-NEXT:    vlse64.v v9, (a0), zero
-; RV32-V-NEXT:    vmulh.vv v9, v8, v9
 ; RV32-V-NEXT:    li a0, 63
+; RV32-V-NEXT:    vmulh.vv v9, v8, v9
 ; RV32-V-NEXT:    vsrl.vx v10, v9, a0
 ; RV32-V-NEXT:    vsra.vi v9, v9, 1
 ; RV32-V-NEXT:    vadd.vv v9, v9, v10
@@ -1051,16 +1051,16 @@ define <vscale x 2 x i64> @vrem_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
 ; RV32-V-NEXT:    addi sp, sp, -16
 ; RV32-V-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V-NEXT:    lui a0, 748983
-; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    lui a1, 898779
+; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    addi a1, a1, 1755
 ; RV32-V-NEXT:    sw a1, 8(sp)
 ; RV32-V-NEXT:    sw a0, 12(sp)
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-V-NEXT:    vlse64.v v10, (a0), zero
-; RV32-V-NEXT:    vmulh.vv v10, v8, v10
 ; RV32-V-NEXT:    li a0, 63
+; RV32-V-NEXT:    vmulh.vv v10, v8, v10
 ; RV32-V-NEXT:    vsrl.vx v12, v10, a0
 ; RV32-V-NEXT:    vsra.vi v10, v10, 1
 ; RV32-V-NEXT:    vadd.vv v10, v10, v12
@@ -1136,16 +1136,16 @@ define <vscale x 4 x i64> @vrem_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
 ; RV32-V-NEXT:    addi sp, sp, -16
 ; RV32-V-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V-NEXT:    lui a0, 748983
-; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    lui a1, 898779
+; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    addi a1, a1, 1755
 ; RV32-V-NEXT:    sw a1, 8(sp)
 ; RV32-V-NEXT:    sw a0, 12(sp)
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-V-NEXT:    vlse64.v v12, (a0), zero
-; RV32-V-NEXT:    vmulh.vv v12, v8, v12
 ; RV32-V-NEXT:    li a0, 63
+; RV32-V-NEXT:    vmulh.vv v12, v8, v12
 ; RV32-V-NEXT:    vsrl.vx v16, v12, a0
 ; RV32-V-NEXT:    vsra.vi v12, v12, 1
 ; RV32-V-NEXT:    vadd.vv v12, v12, v16
@@ -1221,16 +1221,16 @@ define <vscale x 8 x i64> @vrem_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
 ; RV32-V-NEXT:    addi sp, sp, -16
 ; RV32-V-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-V-NEXT:    lui a0, 748983
-; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    lui a1, 898779
+; RV32-V-NEXT:    addi a0, a0, -586
 ; RV32-V-NEXT:    addi a1, a1, 1755
 ; RV32-V-NEXT:    sw a1, 8(sp)
 ; RV32-V-NEXT:    sw a0, 12(sp)
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-V-NEXT:    vlse64.v v16, (a0), zero
-; RV32-V-NEXT:    vmulh.vv v16, v8, v16
 ; RV32-V-NEXT:    li a0, 63
+; RV32-V-NEXT:    vmulh.vv v16, v8, v16
 ; RV32-V-NEXT:    vsrl.vx v24, v16, a0
 ; RV32-V-NEXT:    vsra.vi v16, v16, 1
 ; RV32-V-NEXT:    vadd.vv v16, v16, v24

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
index 95d8533bc53b25..ba6d95c5a43b7b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrem-vp.ll
@@ -11,10 +11,10 @@ define <vscale x 8 x i7> @vrem_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsll.vi v9, v9, 1, v0.t
 ; CHECK-NEXT:    vsra.vi v9, v9, 1, v0.t
 ; CHECK-NEXT:    vrem.vv v8, v8, v9, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
index efcf717231c280..8820e173f02fb5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vremu-sdnode.ll
@@ -711,8 +711,8 @@ define <vscale x 1 x i64> @vremu_vi_nxv1i64_0(<vscale x 1 x i64> %va) {
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; RV32-V-NEXT:    vlse64.v v9, (a0), zero
-; RV32-V-NEXT:    vmulhu.vv v9, v8, v9
 ; RV32-V-NEXT:    li a0, 61
+; RV32-V-NEXT:    vmulhu.vv v9, v8, v9
 ; RV32-V-NEXT:    vsrl.vx v9, v9, a0
 ; RV32-V-NEXT:    li a0, -7
 ; RV32-V-NEXT:    vnmsac.vx v8, a0, v9
@@ -818,8 +818,8 @@ define <vscale x 2 x i64> @vremu_vi_nxv2i64_0(<vscale x 2 x i64> %va) {
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
 ; RV32-V-NEXT:    vlse64.v v10, (a0), zero
-; RV32-V-NEXT:    vmulhu.vv v10, v8, v10
 ; RV32-V-NEXT:    li a0, 61
+; RV32-V-NEXT:    vmulhu.vv v10, v8, v10
 ; RV32-V-NEXT:    vsrl.vx v10, v10, a0
 ; RV32-V-NEXT:    li a0, -7
 ; RV32-V-NEXT:    vnmsac.vx v8, a0, v10
@@ -925,8 +925,8 @@ define <vscale x 4 x i64> @vremu_vi_nxv4i64_0(<vscale x 4 x i64> %va) {
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
 ; RV32-V-NEXT:    vlse64.v v12, (a0), zero
-; RV32-V-NEXT:    vmulhu.vv v12, v8, v12
 ; RV32-V-NEXT:    li a0, 61
+; RV32-V-NEXT:    vmulhu.vv v12, v8, v12
 ; RV32-V-NEXT:    vsrl.vx v12, v12, a0
 ; RV32-V-NEXT:    li a0, -7
 ; RV32-V-NEXT:    vnmsac.vx v8, a0, v12
@@ -1032,8 +1032,8 @@ define <vscale x 8 x i64> @vremu_vi_nxv8i64_0(<vscale x 8 x i64> %va) {
 ; RV32-V-NEXT:    addi a0, sp, 8
 ; RV32-V-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
 ; RV32-V-NEXT:    vlse64.v v16, (a0), zero
-; RV32-V-NEXT:    vmulhu.vv v16, v8, v16
 ; RV32-V-NEXT:    li a0, 61
+; RV32-V-NEXT:    vmulhu.vv v16, v8, v16
 ; RV32-V-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-V-NEXT:    li a0, -7
 ; RV32-V-NEXT:    vnmsac.vx v8, a0, v16

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
index 25ef62386699cb..6b32f101f6d3c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vremu-vp.ll
@@ -10,11 +10,10 @@ define <vscale x 8 x i7> @vremu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <
 ; CHECK-LABEL: vremu_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vremu.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
index 4a86b717f9f3ce..cb925bfb0f237f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrol-sdnode.ll
@@ -11,8 +11,8 @@ define <vscale x 1 x i8> @vrol_vv_nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -33,8 +33,8 @@ define <vscale x 1 x i8> @vrol_vx_nxv1i8(<vscale x 1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -58,8 +58,8 @@ define <vscale x 2 x i8> @vrol_vv_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -80,8 +80,8 @@ define <vscale x 2 x i8> @vrol_vx_nxv2i8(<vscale x 2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -105,8 +105,8 @@ define <vscale x 4 x i8> @vrol_vv_nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -127,8 +127,8 @@ define <vscale x 4 x i8> @vrol_vx_nxv4i8(<vscale x 4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -152,8 +152,8 @@ define <vscale x 8 x i8> @vrol_vv_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -174,8 +174,8 @@ define <vscale x 8 x i8> @vrol_vx_nxv8i8(<vscale x 8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -199,8 +199,8 @@ define <vscale x 16 x i8> @vrol_vv_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -221,8 +221,8 @@ define <vscale x 16 x i8> @vrol_vx_nxv16i8(<vscale x 16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -246,8 +246,8 @@ define <vscale x 32 x i8> @vrol_vv_nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -268,8 +268,8 @@ define <vscale x 32 x i8> @vrol_vx_nxv32i8(<vscale x 32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -293,8 +293,8 @@ define <vscale x 64 x i8> @vrol_vv_nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vand.vi v24, v16, 7
-; CHECK-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-NEXT:    vand.vi v16, v16, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v16
 ; CHECK-NEXT:    vor.vv v8, v24, v8
@@ -315,8 +315,8 @@ define <vscale x 64 x i8> @vrol_vx_nxv64i8(<vscale x 64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vand.vi v24, v16, 7
-; CHECK-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-NEXT:    vand.vi v16, v16, 7
 ; CHECK-NEXT:    vsrl.vv v8, v8, v16
 ; CHECK-NEXT:    vor.vv v8, v24, v8
@@ -340,8 +340,8 @@ define <vscale x 1 x i16> @vrol_vv_nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -362,8 +362,8 @@ define <vscale x 1 x i16> @vrol_vx_nxv1i16(<vscale x 1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -387,8 +387,8 @@ define <vscale x 2 x i16> @vrol_vv_nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -409,8 +409,8 @@ define <vscale x 2 x i16> @vrol_vx_nxv2i16(<vscale x 2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -434,8 +434,8 @@ define <vscale x 4 x i16> @vrol_vv_nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -456,8 +456,8 @@ define <vscale x 4 x i16> @vrol_vx_nxv4i16(<vscale x 4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -481,8 +481,8 @@ define <vscale x 8 x i16> @vrol_vv_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -503,8 +503,8 @@ define <vscale x 8 x i16> @vrol_vx_nxv8i16(<vscale x 8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -528,8 +528,8 @@ define <vscale x 16 x i16> @vrol_vv_nxv16i16(<vscale x 16 x i16> %a, <vscale x 1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -550,8 +550,8 @@ define <vscale x 16 x i16> @vrol_vx_nxv16i16(<vscale x 16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -575,8 +575,8 @@ define <vscale x 32 x i16> @vrol_vv_nxv32i16(<vscale x 32 x i16> %a, <vscale x 3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vand.vi v24, v16, 15
-; CHECK-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-NEXT:    vand.vi v16, v16, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v16
 ; CHECK-NEXT:    vor.vv v8, v24, v8
@@ -597,8 +597,8 @@ define <vscale x 32 x i16> @vrol_vx_nxv32i16(<vscale x 32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vand.vi v24, v16, 15
-; CHECK-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-NEXT:    vand.vi v16, v16, 15
 ; CHECK-NEXT:    vsrl.vv v8, v8, v16
 ; CHECK-NEXT:    vor.vv v8, v24, v8
@@ -622,12 +622,12 @@ define <vscale x 1 x i32> @vrol_vv_nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_nxv1i32:
@@ -643,9 +643,9 @@ define <vscale x 1 x i32> @vrol_vx_nxv1i32(<vscale x 1 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vrol_vx_nxv1i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; CHECK-RV32-NEXT:    vsll.vx v9, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v9, v8
@@ -657,8 +657,8 @@ define <vscale x 1 x i32> @vrol_vx_nxv1i32(<vscale x 1 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v9, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-RV64-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
@@ -682,12 +682,12 @@ define <vscale x 2 x i32> @vrol_vv_nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_nxv2i32:
@@ -703,9 +703,9 @@ define <vscale x 2 x i32> @vrol_vx_nxv2i32(<vscale x 2 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vrol_vx_nxv2i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsll.vx v9, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v9, v8
@@ -717,8 +717,8 @@ define <vscale x 2 x i32> @vrol_vx_nxv2i32(<vscale x 2 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v9, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-RV64-NEXT:    vsll.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
@@ -742,12 +742,12 @@ define <vscale x 4 x i32> @vrol_vv_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsll.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_nxv4i32:
@@ -763,9 +763,9 @@ define <vscale x 4 x i32> @vrol_vx_nxv4i32(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vrol_vx_nxv4i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; CHECK-RV32-NEXT:    vsll.vx v10, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v10, v8
@@ -777,8 +777,8 @@ define <vscale x 4 x i32> @vrol_vx_nxv4i32(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v10, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV64-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-RV64-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-RV64-NEXT:    vsll.vv v12, v8, v12
 ; CHECK-RV64-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
@@ -802,12 +802,12 @@ define <vscale x 8 x i32> @vrol_vv_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsll.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
+; CHECK-NEXT:    vsrl.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_nxv8i32:
@@ -823,9 +823,9 @@ define <vscale x 8 x i32> @vrol_vx_nxv8i32(<vscale x 8 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vrol_vx_nxv8i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; CHECK-RV32-NEXT:    vsll.vx v12, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v12, v8
@@ -837,8 +837,8 @@ define <vscale x 8 x i32> @vrol_vx_nxv8i32(<vscale x 8 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v12, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV64-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-RV64-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-RV64-NEXT:    vsll.vv v16, v8, v16
 ; CHECK-RV64-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
@@ -862,12 +862,12 @@ define <vscale x 16 x i32> @vrol_vv_nxv16i32(<vscale x 16 x i32> %a, <vscale x 1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vand.vx v24, v16, a0
-; CHECK-NEXT:    vsll.vv v24, v8, v24
-; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vand.vx v24, v24, a0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
+; CHECK-NEXT:    vsrl.vv v8, v8, v24
+; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_nxv16i32:
@@ -883,9 +883,9 @@ define <vscale x 16 x i32> @vrol_vx_nxv16i32(<vscale x 16 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vrol_vx_nxv16i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; CHECK-RV32-NEXT:    vsll.vx v16, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v16, v8
@@ -897,8 +897,8 @@ define <vscale x 16 x i32> @vrol_vx_nxv16i32(<vscale x 16 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v16, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v24, v16, a0
-; CHECK-RV64-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-RV64-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-RV64-NEXT:    vsll.vv v24, v8, v24
 ; CHECK-RV64-NEXT:    vand.vx v16, v16, a0
 ; CHECK-RV64-NEXT:    vsrl.vv v8, v8, v16
 ; CHECK-RV64-NEXT:    vor.vv v8, v24, v8
@@ -922,12 +922,12 @@ define <vscale x 1 x i64> @vrol_vv_nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsll.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsll.vv v9, v8, v9
+; CHECK-NEXT:    vsrl.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_nxv1i64:
@@ -949,9 +949,9 @@ define <vscale x 1 x i64> @vrol_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v11, v10, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vand.vx v10, v11, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
@@ -959,9 +959,9 @@ define <vscale x 1 x i64> @vrol_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) {
 ; CHECK-RV64-LABEL: vrol_vx_nxv1i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    andi a1, a0, 63
+; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vsll.vx v9, v8, a1
-; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    andi a0, a0, 63
 ; CHECK-RV64-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    vor.vv v8, v9, v8
@@ -985,12 +985,12 @@ define <vscale x 2 x i64> @vrol_vv_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsll.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsll.vv v10, v8, v10
+; CHECK-NEXT:    vsrl.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_nxv2i64:
@@ -1012,9 +1012,9 @@ define <vscale x 2 x i64> @vrol_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v14, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV32-NEXT:    vand.vx v12, v14, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV32-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
@@ -1022,9 +1022,9 @@ define <vscale x 2 x i64> @vrol_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-RV64-LABEL: vrol_vx_nxv2i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    andi a1, a0, 63
+; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
 ; CHECK-RV64-NEXT:    vsll.vx v10, v8, a1
-; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    andi a0, a0, 63
 ; CHECK-RV64-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
@@ -1048,12 +1048,12 @@ define <vscale x 4 x i64> @vrol_vv_nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsll.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsll.vv v12, v8, v12
+; CHECK-NEXT:    vsrl.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_nxv4i64:
@@ -1075,9 +1075,9 @@ define <vscale x 4 x i64> @vrol_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v20, v16, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vand.vx v16, v20, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    ret
@@ -1085,9 +1085,9 @@ define <vscale x 4 x i64> @vrol_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
 ; CHECK-RV64-LABEL: vrol_vx_nxv4i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    andi a1, a0, 63
+; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
 ; CHECK-RV64-NEXT:    vsll.vx v12, v8, a1
-; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    andi a0, a0, 63
 ; CHECK-RV64-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
@@ -1111,12 +1111,12 @@ define <vscale x 8 x i64> @vrol_vv_nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v24, v16, a0
-; CHECK-NEXT:    vsll.vv v24, v8, v24
-; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsrl.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vand.vx v24, v24, a0
+; CHECK-NEXT:    vsll.vv v16, v8, v16
+; CHECK-NEXT:    vsrl.vv v8, v8, v24
+; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vrol_vv_nxv8i64:
@@ -1138,9 +1138,9 @@ define <vscale x 8 x i64> @vrol_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v0, v24, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
 ; CHECK-RV32-NEXT:    vand.vx v24, v0, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v24, v8, v24
-; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
 ; CHECK-RV32-NEXT:    vsll.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v24
 ; CHECK-RV32-NEXT:    ret
@@ -1148,9 +1148,9 @@ define <vscale x 8 x i64> @vrol_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
 ; CHECK-RV64-LABEL: vrol_vx_nxv8i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    andi a1, a0, 63
+; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vsll.vx v16, v8, a1
-; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    andi a0, a0, 63
 ; CHECK-RV64-NEXT:    vsrl.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
index cf2f0d8873165e..74743f9068a644 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vror-sdnode.ll
@@ -12,8 +12,8 @@ define <vscale x 1 x i8> @vror_vv_nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -34,8 +34,8 @@ define <vscale x 1 x i8> @vror_vx_nxv1i8(<vscale x 1 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -96,8 +96,8 @@ define <vscale x 2 x i8> @vror_vv_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -118,8 +118,8 @@ define <vscale x 2 x i8> @vror_vx_nxv2i8(<vscale x 2 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -180,8 +180,8 @@ define <vscale x 4 x i8> @vror_vv_nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -202,8 +202,8 @@ define <vscale x 4 x i8> @vror_vx_nxv4i8(<vscale x 4 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -264,8 +264,8 @@ define <vscale x 8 x i8> @vror_vv_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8>
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -286,8 +286,8 @@ define <vscale x 8 x i8> @vror_vx_nxv8i8(<vscale x 8 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 7
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -348,8 +348,8 @@ define <vscale x 16 x i8> @vror_vv_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -370,8 +370,8 @@ define <vscale x 16 x i8> @vror_vx_nxv16i8(<vscale x 16 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 7
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -432,8 +432,8 @@ define <vscale x 32 x i8> @vror_vv_nxv32i8(<vscale x 32 x i8> %a, <vscale x 32 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -454,8 +454,8 @@ define <vscale x 32 x i8> @vror_vx_nxv32i8(<vscale x 32 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 7
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -516,8 +516,8 @@ define <vscale x 64 x i8> @vror_vv_nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vand.vi v24, v16, 7
-; CHECK-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-NEXT:    vand.vi v16, v16, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v16
 ; CHECK-NEXT:    vor.vv v8, v24, v8
@@ -538,8 +538,8 @@ define <vscale x 64 x i8> @vror_vx_nxv64i8(<vscale x 64 x i8> %a, i8 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vand.vi v24, v16, 7
-; CHECK-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-NEXT:    vand.vi v16, v16, 7
 ; CHECK-NEXT:    vsll.vv v8, v8, v16
 ; CHECK-NEXT:    vor.vv v8, v24, v8
@@ -600,8 +600,8 @@ define <vscale x 1 x i16> @vror_vv_nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -622,8 +622,8 @@ define <vscale x 1 x i16> @vror_vx_nxv1i16(<vscale x 1 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -684,8 +684,8 @@ define <vscale x 2 x i16> @vror_vv_nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -706,8 +706,8 @@ define <vscale x 2 x i16> @vror_vx_nxv2i16(<vscale x 2 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -768,8 +768,8 @@ define <vscale x 4 x i16> @vror_vv_nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -790,8 +790,8 @@ define <vscale x 4 x i16> @vror_vx_nxv4i16(<vscale x 4 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vand.vi v10, v9, 15
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-NEXT:    vand.vi v9, v9, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-NEXT:    vor.vv v8, v10, v8
@@ -852,8 +852,8 @@ define <vscale x 8 x i16> @vror_vv_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -874,8 +874,8 @@ define <vscale x 8 x i16> @vror_vx_nxv8i16(<vscale x 8 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vmv.v.x v10, a0
 ; CHECK-NEXT:    vand.vi v12, v10, 15
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-NEXT:    vand.vi v10, v10, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-NEXT:    vor.vv v8, v12, v8
@@ -936,8 +936,8 @@ define <vscale x 16 x i16> @vror_vv_nxv16i16(<vscale x 16 x i16> %a, <vscale x 1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -958,8 +958,8 @@ define <vscale x 16 x i16> @vror_vx_nxv16i16(<vscale x 16 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vmv.v.x v12, a0
 ; CHECK-NEXT:    vand.vi v16, v12, 15
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-NEXT:    vand.vi v12, v12, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-NEXT:    vor.vv v8, v16, v8
@@ -1020,8 +1020,8 @@ define <vscale x 32 x i16> @vror_vv_nxv32i16(<vscale x 32 x i16> %a, <vscale x 3
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vand.vi v24, v16, 15
-; CHECK-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-NEXT:    vand.vi v16, v16, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v16
 ; CHECK-NEXT:    vor.vv v8, v24, v8
@@ -1042,8 +1042,8 @@ define <vscale x 32 x i16> @vror_vx_nxv32i16(<vscale x 32 x i16> %a, i16 %b) {
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.x v16, a0
 ; CHECK-NEXT:    vand.vi v24, v16, 15
-; CHECK-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-NEXT:    vand.vi v16, v16, 15
 ; CHECK-NEXT:    vsll.vv v8, v8, v16
 ; CHECK-NEXT:    vor.vv v8, v24, v8
@@ -1104,12 +1104,12 @@ define <vscale x 1 x i32> @vror_vv_nxv1i32(<vscale x 1 x i32> %a, <vscale x 1 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_nxv1i32:
@@ -1125,9 +1125,9 @@ define <vscale x 1 x i32> @vror_vx_nxv1i32(<vscale x 1 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vror_vx_nxv1i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v9, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsll.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v9, v8
@@ -1139,8 +1139,8 @@ define <vscale x 1 x i32> @vror_vx_nxv1i32(<vscale x 1 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v9, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
@@ -1201,12 +1201,12 @@ define <vscale x 2 x i32> @vror_vv_nxv2i32(<vscale x 2 x i32> %a, <vscale x 2 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_nxv2i32:
@@ -1222,9 +1222,9 @@ define <vscale x 2 x i32> @vror_vx_nxv2i32(<vscale x 2 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vror_vx_nxv2i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v9, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsll.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v9, v8
@@ -1236,8 +1236,8 @@ define <vscale x 2 x i32> @vror_vx_nxv2i32(<vscale x 2 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v9, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v10, v9, a0
-; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-RV64-NEXT:    vsrl.vv v10, v8, v10
 ; CHECK-RV64-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v9
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
@@ -1298,12 +1298,12 @@ define <vscale x 4 x i32> @vror_vv_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vsll.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_nxv4i32:
@@ -1319,9 +1319,9 @@ define <vscale x 4 x i32> @vror_vx_nxv4i32(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vror_vx_nxv4i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v10, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsll.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v10, v8
@@ -1333,8 +1333,8 @@ define <vscale x 4 x i32> @vror_vx_nxv4i32(<vscale x 4 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v10, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v12, v10, a0
-; CHECK-RV64-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-RV64-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-RV64-NEXT:    vsrl.vv v12, v8, v12
 ; CHECK-RV64-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v10
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
@@ -1395,12 +1395,12 @@ define <vscale x 8 x i32> @vror_vv_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-NEXT:    vsll.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_nxv8i32:
@@ -1416,9 +1416,9 @@ define <vscale x 8 x i32> @vror_vx_nxv8i32(<vscale x 8 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vror_vx_nxv8i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, m4, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v12, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsll.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v12, v8
@@ -1430,8 +1430,8 @@ define <vscale x 8 x i32> @vror_vx_nxv8i32(<vscale x 8 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v12, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v16, v12, a0
-; CHECK-RV64-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-RV64-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-RV64-NEXT:    vsrl.vv v16, v8, v16
 ; CHECK-RV64-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v12
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8
@@ -1492,12 +1492,12 @@ define <vscale x 16 x i32> @vror_vv_nxv16i32(<vscale x 16 x i32> %a, <vscale x 1
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 31
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vand.vx v24, v16, a0
-; CHECK-NEXT:    vsrl.vv v24, v8, v24
-; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vand.vx v24, v24, a0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
+; CHECK-NEXT:    vsll.vv v8, v8, v24
+; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_nxv16i32:
@@ -1513,9 +1513,9 @@ define <vscale x 16 x i32> @vror_vx_nxv16i32(<vscale x 16 x i32> %a, i32 %b) {
 ; CHECK-RV32-LABEL: vror_vx_nxv16i32:
 ; CHECK-RV32:       # %bb.0:
 ; CHECK-RV32-NEXT:    andi a1, a0, 31
+; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
 ; CHECK-RV32-NEXT:    vsrl.vx v16, v8, a1
-; CHECK-RV32-NEXT:    neg a0, a0
 ; CHECK-RV32-NEXT:    andi a0, a0, 31
 ; CHECK-RV32-NEXT:    vsll.vx v8, v8, a0
 ; CHECK-RV32-NEXT:    vor.vv v8, v16, v8
@@ -1527,8 +1527,8 @@ define <vscale x 16 x i32> @vror_vx_nxv16i32(<vscale x 16 x i32> %a, i32 %b) {
 ; CHECK-RV64-NEXT:    vmv.v.x v16, a0
 ; CHECK-RV64-NEXT:    li a0, 31
 ; CHECK-RV64-NEXT:    vand.vx v24, v16, a0
-; CHECK-RV64-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-RV64-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-RV64-NEXT:    vsrl.vv v24, v8, v24
 ; CHECK-RV64-NEXT:    vand.vx v16, v16, a0
 ; CHECK-RV64-NEXT:    vsll.vv v8, v8, v16
 ; CHECK-RV64-NEXT:    vor.vv v8, v24, v8
@@ -1589,12 +1589,12 @@ define <vscale x 1 x i64> @vror_vv_nxv1i64(<vscale x 1 x i64> %a, <vscale x 1 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vand.vx v10, v9, a0
-; CHECK-NEXT:    vsrl.vv v10, v8, v10
-; CHECK-NEXT:    vrsub.vi v9, v9, 0
+; CHECK-NEXT:    vrsub.vi v10, v9, 0
 ; CHECK-NEXT:    vand.vx v9, v9, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v9
-; CHECK-NEXT:    vor.vv v8, v10, v8
+; CHECK-NEXT:    vand.vx v10, v10, a0
+; CHECK-NEXT:    vsrl.vv v9, v8, v9
+; CHECK-NEXT:    vsll.vv v8, v8, v10
+; CHECK-NEXT:    vor.vv v8, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_nxv1i64:
@@ -1616,9 +1616,9 @@ define <vscale x 1 x i64> @vror_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v11, v10, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vand.vx v10, v11, a0
 ; CHECK-RV32-NEXT:    vsll.vv v10, v8, v10
-; CHECK-RV32-NEXT:    vand.vx v9, v9, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v9
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    ret
@@ -1626,9 +1626,9 @@ define <vscale x 1 x i64> @vror_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b) {
 ; CHECK-RV64-LABEL: vror_vx_nxv1i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    andi a1, a0, 63
+; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
 ; CHECK-RV64-NEXT:    vsrl.vx v9, v8, a1
-; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    andi a0, a0, 63
 ; CHECK-RV64-NEXT:    vsll.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    vor.vv v8, v9, v8
@@ -1691,12 +1691,12 @@ define <vscale x 2 x i64> @vror_vv_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vand.vx v12, v10, a0
-; CHECK-NEXT:    vsrl.vv v12, v8, v12
-; CHECK-NEXT:    vrsub.vi v10, v10, 0
+; CHECK-NEXT:    vrsub.vi v12, v10, 0
 ; CHECK-NEXT:    vand.vx v10, v10, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v10
-; CHECK-NEXT:    vor.vv v8, v12, v8
+; CHECK-NEXT:    vand.vx v12, v12, a0
+; CHECK-NEXT:    vsrl.vv v10, v8, v10
+; CHECK-NEXT:    vsll.vv v8, v8, v12
+; CHECK-NEXT:    vor.vv v8, v10, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_nxv2i64:
@@ -1718,9 +1718,9 @@ define <vscale x 2 x i64> @vror_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v14, v12, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV32-NEXT:    vand.vx v12, v14, a0
 ; CHECK-RV32-NEXT:    vsll.vv v12, v8, v12
-; CHECK-RV32-NEXT:    vand.vx v10, v10, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v10
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    ret
@@ -1728,9 +1728,9 @@ define <vscale x 2 x i64> @vror_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b) {
 ; CHECK-RV64-LABEL: vror_vx_nxv2i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    andi a1, a0, 63
+; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
 ; CHECK-RV64-NEXT:    vsrl.vx v10, v8, a1
-; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    andi a0, a0, 63
 ; CHECK-RV64-NEXT:    vsll.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    vor.vv v8, v10, v8
@@ -1793,12 +1793,12 @@ define <vscale x 4 x i64> @vror_vv_nxv4i64(<vscale x 4 x i64> %a, <vscale x 4 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; CHECK-NEXT:    vand.vx v16, v12, a0
-; CHECK-NEXT:    vsrl.vv v16, v8, v16
-; CHECK-NEXT:    vrsub.vi v12, v12, 0
+; CHECK-NEXT:    vrsub.vi v16, v12, 0
 ; CHECK-NEXT:    vand.vx v12, v12, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v12
-; CHECK-NEXT:    vor.vv v8, v16, v8
+; CHECK-NEXT:    vand.vx v16, v16, a0
+; CHECK-NEXT:    vsrl.vv v12, v8, v12
+; CHECK-NEXT:    vsll.vv v8, v8, v16
+; CHECK-NEXT:    vor.vv v8, v12, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_nxv4i64:
@@ -1820,9 +1820,9 @@ define <vscale x 4 x i64> @vror_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v20, v16, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vand.vx v16, v20, a0
 ; CHECK-RV32-NEXT:    vsll.vv v16, v8, v16
-; CHECK-RV32-NEXT:    vand.vx v12, v12, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v12
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    ret
@@ -1830,9 +1830,9 @@ define <vscale x 4 x i64> @vror_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b) {
 ; CHECK-RV64-LABEL: vror_vx_nxv4i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    andi a1, a0, 63
+; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    vsetvli a2, zero, e64, m4, ta, ma
 ; CHECK-RV64-NEXT:    vsrl.vx v12, v8, a1
-; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    andi a0, a0, 63
 ; CHECK-RV64-NEXT:    vsll.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    vor.vv v8, v12, v8
@@ -1895,12 +1895,12 @@ define <vscale x 8 x i64> @vror_vv_nxv8i64(<vscale x 8 x i64> %a, <vscale x 8 x
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a0, 63
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v24, v16, a0
-; CHECK-NEXT:    vsrl.vv v24, v8, v24
-; CHECK-NEXT:    vrsub.vi v16, v16, 0
+; CHECK-NEXT:    vrsub.vi v24, v16, 0
 ; CHECK-NEXT:    vand.vx v16, v16, a0
-; CHECK-NEXT:    vsll.vv v8, v8, v16
-; CHECK-NEXT:    vor.vv v8, v24, v8
+; CHECK-NEXT:    vand.vx v24, v24, a0
+; CHECK-NEXT:    vsrl.vv v16, v8, v16
+; CHECK-NEXT:    vsll.vv v8, v8, v24
+; CHECK-NEXT:    vor.vv v8, v16, v8
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vror_vv_nxv8i64:
@@ -1922,9 +1922,9 @@ define <vscale x 8 x i64> @vror_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
 ; CHECK-RV32-NEXT:    vwsub.vx v0, v24, a0
 ; CHECK-RV32-NEXT:    li a0, 63
 ; CHECK-RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
 ; CHECK-RV32-NEXT:    vand.vx v24, v0, a0
 ; CHECK-RV32-NEXT:    vsll.vv v24, v8, v24
-; CHECK-RV32-NEXT:    vand.vx v16, v16, a0
 ; CHECK-RV32-NEXT:    vsrl.vv v8, v8, v16
 ; CHECK-RV32-NEXT:    vor.vv v8, v8, v24
 ; CHECK-RV32-NEXT:    ret
@@ -1932,9 +1932,9 @@ define <vscale x 8 x i64> @vror_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b) {
 ; CHECK-RV64-LABEL: vror_vx_nxv8i64:
 ; CHECK-RV64:       # %bb.0:
 ; CHECK-RV64-NEXT:    andi a1, a0, 63
+; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; CHECK-RV64-NEXT:    vsrl.vx v16, v8, a1
-; CHECK-RV64-NEXT:    negw a0, a0
 ; CHECK-RV64-NEXT:    andi a0, a0, 63
 ; CHECK-RV64-NEXT:    vsll.vx v8, v8, a0
 ; CHECK-RV64-NEXT:    vor.vv v8, v16, v8

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
index d71fa33cf1f23b..12c439346e3569 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -1353,9 +1353,9 @@ define <vscale x 32 x i32> @vsadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
index eb57741e701dd4..d962f703abfd22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -11,8 +11,8 @@ define <vscale x 8 x i7> @vsaddu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vadd.vv v8, v8, v9, v0.t
@@ -1352,9 +1352,9 @@ define <vscale x 32 x i32> @vsaddu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
index d2314ebfc3d2b5..b2a719b1f7901d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
@@ -92,9 +92,9 @@ define i64 @con2048_minus_rem() {
 ; CHECK-LABEL: con2048_minus_rem:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    neg a0, a0
-; CHECK-NEXT:    li a1, 1
 ; CHECK-NEXT:    slli a1, a1, 11
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
index a14ce717261536..58b6f0253b99af 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll
@@ -156,6 +156,7 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y,
 ; NO_FOLDING-NEXT:    vlm.v v9, (a1)
 ; NO_FOLDING-NEXT:    vlm.v v10, (a2)
 ; NO_FOLDING-NEXT:    vmv.v.i v11, 0
+; NO_FOLDING-NEXT:    li a0, 1
 ; NO_FOLDING-NEXT:    vmv.v.v v0, v8
 ; NO_FOLDING-NEXT:    vmerge.vim v12, v11, -1, v0
 ; NO_FOLDING-NEXT:    vmv.v.v v0, v9
@@ -163,7 +164,6 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y,
 ; NO_FOLDING-NEXT:    vmv.v.v v0, v10
 ; NO_FOLDING-NEXT:    vmerge.vim v10, v11, -1, v0
 ; NO_FOLDING-NEXT:    vmul.vv v9, v12, v9
-; NO_FOLDING-NEXT:    li a0, 1
 ; NO_FOLDING-NEXT:    vsub.vv v11, v12, v10
 ; NO_FOLDING-NEXT:    vmv.v.v v0, v8
 ; NO_FOLDING-NEXT:    vsub.vx v10, v10, a0, v0.t
@@ -178,6 +178,7 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y,
 ; FOLDING-NEXT:    vlm.v v9, (a1)
 ; FOLDING-NEXT:    vlm.v v10, (a2)
 ; FOLDING-NEXT:    vmv.v.i v11, 0
+; FOLDING-NEXT:    li a0, 1
 ; FOLDING-NEXT:    vmv.v.v v0, v8
 ; FOLDING-NEXT:    vmerge.vim v12, v11, -1, v0
 ; FOLDING-NEXT:    vmv.v.v v0, v9
@@ -185,7 +186,6 @@ define <vscale x 2 x i32> @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y,
 ; FOLDING-NEXT:    vmv.v.v v0, v10
 ; FOLDING-NEXT:    vmerge.vim v10, v11, -1, v0
 ; FOLDING-NEXT:    vmul.vv v9, v12, v9
-; FOLDING-NEXT:    li a0, 1
 ; FOLDING-NEXT:    vsub.vv v11, v12, v10
 ; FOLDING-NEXT:    vmv.v.v v0, v8
 ; FOLDING-NEXT:    vsub.vx v10, v10, a0, v0.t
@@ -214,6 +214,7 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
 ; NO_FOLDING-NEXT:    vlm.v v9, (a1)
 ; NO_FOLDING-NEXT:    vlm.v v10, (a2)
 ; NO_FOLDING-NEXT:    vmv.v.i v11, 0
+; NO_FOLDING-NEXT:    li a0, 1
 ; NO_FOLDING-NEXT:    vmv1r.v v0, v8
 ; NO_FOLDING-NEXT:    vmerge.vim v12, v11, -1, v0
 ; NO_FOLDING-NEXT:    vmv1r.v v0, v9
@@ -221,7 +222,6 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
 ; NO_FOLDING-NEXT:    vmv1r.v v0, v10
 ; NO_FOLDING-NEXT:    vmerge.vim v10, v11, -1, v0
 ; NO_FOLDING-NEXT:    vmul.vv v9, v12, v9
-; NO_FOLDING-NEXT:    li a0, 1
 ; NO_FOLDING-NEXT:    vsub.vv v11, v12, v10
 ; NO_FOLDING-NEXT:    vmv1r.v v0, v8
 ; NO_FOLDING-NEXT:    vsub.vx v10, v10, a0, v0.t
@@ -236,6 +236,7 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
 ; FOLDING-NEXT:    vlm.v v9, (a1)
 ; FOLDING-NEXT:    vlm.v v10, (a2)
 ; FOLDING-NEXT:    vmv.v.i v11, 0
+; FOLDING-NEXT:    li a0, 1
 ; FOLDING-NEXT:    vmv1r.v v0, v8
 ; FOLDING-NEXT:    vmerge.vim v12, v11, -1, v0
 ; FOLDING-NEXT:    vmv1r.v v0, v9
@@ -243,7 +244,6 @@ define <vscale x 2 x i8> @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p
 ; FOLDING-NEXT:    vmv1r.v v0, v10
 ; FOLDING-NEXT:    vmerge.vim v10, v11, -1, v0
 ; FOLDING-NEXT:    vmul.vv v9, v12, v9
-; FOLDING-NEXT:    li a0, 1
 ; FOLDING-NEXT:    vsub.vv v11, v12, v10
 ; FOLDING-NEXT:    vmv1r.v v0, v8
 ; FOLDING-NEXT:    vsub.vx v10, v10, a0, v0.t

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
index 1f1a62f57664f3..1fc33dc73a27dc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
@@ -493,17 +493,16 @@ define <vscale x 8 x double> @vfmerge_nzv_nxv8f64(<vscale x 8 x double> %va, <vs
 define <vscale x 16 x double> @vselect_combine_regression(<vscale x 16 x i64> %va, <vscale x 16 x double> %vb) {
 ; CHECK-LABEL: vselect_combine_regression:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, mu
+; CHECK-NEXT:    vmseq.vi v24, v16, 0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
 ; CHECK-NEXT:    vmv.v.i v16, 0
-; CHECK-NEXT:    vmseq.vi v7, v24, 0
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    vle64.v v8, (a0), v0.t
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vle64.v v16, (a1), v0.t
 ; CHECK-NEXT:    ret
   %cond = icmp eq <vscale x 16 x i64> %va, zeroinitializer
@@ -516,11 +515,13 @@ define void @vselect_legalize_regression(<vscale x 16 x double> %a, <vscale x 16
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vlm.v v24, (a0)
-; CHECK-NEXT:    vmand.mm v7, v0, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a2, a0, 3
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    vmand.mm v7, v0, v24
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
@@ -528,8 +529,6 @@ define void @vselect_legalize_regression(<vscale x 16 x double> %a, <vscale x 16
 ; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    vs8r.v v8, (a1)
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    vs8r.v v16, (a0)
 ; CHECK-NEXT:    ret
   %cond = and <vscale x 16 x i1> %ma, %mb

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 6483a99fe3632d..bb51f0592dc17a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -354,51 +354,30 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a1, a3, 3
-; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vl8re32.v v8, (a1)
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    slli a4, a3, 3
 ; CHECK-NEXT:    slli a1, a3, 1
-; CHECK-NEXT:    sub a4, a2, a1
-; CHECK-NEXT:    sltu a5, a2, a4
-; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    srli a3, a3, 2
-; CHECK-NEXT:    vl8re32.v v8, (a0)
+; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    sub a5, a2, a1
+; CHECK-NEXT:    vl8re32.v v8, (a4)
+; CHECK-NEXT:    sltu a4, a2, a5
+; CHECK-NEXT:    addi a4, a4, -1
+; CHECK-NEXT:    vl8re32.v v0, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a3
-; CHECK-NEXT:    and a4, a5, a4
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vslidedown.vx v0, v24, a3
+; CHECK-NEXT:    and a4, a4, a5
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    bltu a2, a1, .LBB27_2
@@ -407,17 +386,16 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
 ; CHECK-NEXT:  .LBB27_2:
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -435,51 +413,30 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 3
-; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vl8re32.v v8, (a2)
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    slli a2, a1, 1
-; CHECK-NEXT:    sub a3, a1, a2
-; CHECK-NEXT:    sltu a4, a1, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    srli a5, a1, 2
-; CHECK-NEXT:    vl8re32.v v8, (a0)
+; CHECK-NEXT:    srli a4, a1, 2
+; CHECK-NEXT:    add a3, a0, a3
+; CHECK-NEXT:    sub a5, a1, a2
+; CHECK-NEXT:    vl8re32.v v8, (a3)
+; CHECK-NEXT:    sltu a3, a1, a5
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    vl8re32.v v0, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a5
-; CHECK-NEXT:    and a3, a4, a3
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vslidedown.vx v0, v24, a4
+; CHECK-NEXT:    and a3, a3, a5
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    bltu a1, a2, .LBB28_2
@@ -488,17 +445,16 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; CHECK-NEXT:  .LBB28_2:
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -757,19 +713,19 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
+; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
+; CHECK-NEXT:    sltu a5, a2, a4
 ; CHECK-NEXT:    vl8re64.v v8, (a3)
-; CHECK-NEXT:    sub a3, a2, a1
-; CHECK-NEXT:    sltu a4, a2, a3
-; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    srli a5, a1, 3
+; CHECK-NEXT:    addi a5, a5, -1
+; CHECK-NEXT:    srli a3, a1, 3
 ; CHECK-NEXT:    vl8re64.v v0, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a5
-; CHECK-NEXT:    and a3, a4, a3
-; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v24, a3
+; CHECK-NEXT:    and a4, a5, a4
+; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
 ; CHECK-NEXT:    bltu a2, a1, .LBB48_2
 ; CHECK-NEXT:  # %bb.1:

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
index 027c81180d5f19..80dd87ce5da856 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -448,11 +448,11 @@ define void @saxpy_vec(i64 %n, float %a, ptr nocapture readonly %x, ptr nocaptur
 ; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    slli a4, a3, 2
+; CHECK-NEXT:    sub a0, a0, a3
 ; CHECK-NEXT:    add a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, tu, ma
 ; CHECK-NEXT:    vfmacc.vf v16, fa0, v8
 ; CHECK-NEXT:    vse32.v v16, (a2)
-; CHECK-NEXT:    sub a0, a0, a3
 ; CHECK-NEXT:    vsetvli a3, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    bnez a3, .LBB8_1
@@ -496,11 +496,11 @@ define void @saxpy_vec_demanded_fields(i64 %n, float %a, ptr nocapture readonly
 ; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    vle32.v v16, (a2)
 ; CHECK-NEXT:    slli a4, a3, 2
+; CHECK-NEXT:    sub a0, a0, a3
 ; CHECK-NEXT:    add a1, a1, a4
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, tu, ma
 ; CHECK-NEXT:    vfmacc.vf v16, fa0, v8
 ; CHECK-NEXT:    vse32.v v16, (a2)
-; CHECK-NEXT:    sub a0, a0, a3
 ; CHECK-NEXT:    vsetvli a3, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    add a2, a2, a4
 ; CHECK-NEXT:    bnez a3, .LBB9_1
@@ -640,11 +640,11 @@ define void @vlmax(i64 %N, ptr %c, ptr %a, ptr %b) {
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vle64.v v8, (a2)
 ; CHECK-NEXT:    vle64.v v9, (a3)
+; CHECK-NEXT:    add a4, a4, a6
+; CHECK-NEXT:    add a3, a3, a5
 ; CHECK-NEXT:    vfadd.vv v8, v8, v9
 ; CHECK-NEXT:    vse64.v v8, (a1)
-; CHECK-NEXT:    add a4, a4, a6
 ; CHECK-NEXT:    add a1, a1, a5
-; CHECK-NEXT:    add a3, a3, a5
 ; CHECK-NEXT:    add a2, a2, a5
 ; CHECK-NEXT:    blt a4, a0, .LBB12_2
 ; CHECK-NEXT:  .LBB12_3: # %for.end

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
index 7f01fd4d945c6a..b0cb6bc6125ddf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll
@@ -111,12 +111,12 @@ define void @test6(ptr nocapture readonly %A, ptr nocapture %B, i64 %n) {
 ; CHECK-NEXT:    slli a4, a3, 2
 ; CHECK-NEXT:    add a5, a0, a4
 ; CHECK-NEXT:    vle32.v v8, (a5)
+; CHECK-NEXT:    add a3, a3, a2
 ; CHECK-NEXT:    vmsle.vi v9, v8, -3
 ; CHECK-NEXT:    vmsgt.vi v10, v8, 2
 ; CHECK-NEXT:    vmor.mm v0, v9, v10
 ; CHECK-NEXT:    add a4, a4, a1
 ; CHECK-NEXT:    vse32.v v8, (a4), v0.t
-; CHECK-NEXT:    add a3, a3, a2
 ; CHECK-NEXT:    vsetvli a2, a2, e32, m1, ta, ma
 ; CHECK-NEXT:    bnez a2, .LBB5_2
 ; CHECK-NEXT:  .LBB5_3: # %for.cond.cleanup

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
index 9b5a1a54ad5dfa..d3b905ef897b1b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsext-vp.ll
@@ -154,9 +154,9 @@ define <vscale x 32 x i32> @vsext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
index b446ca90e0f45b..581cc666b6cbd5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsitofp-vp.ll
@@ -508,42 +508,26 @@ declare <vscale x 32 x half> @llvm.vp.sitofp.nxv32f16.nxv32i32(<vscale x 32 x i3
 define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vsitofp_nxv32f16_nxv32i32:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    addi sp, sp, -16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 16
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT:    vmv1r.v v7, v0
-; ZVFH-NEXT:    addi a1, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv1r.v v24, v0
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    srli a2, a1, 2
+; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    sub a2, a0, a1
 ; ZVFH-NEXT:    sltu a3, a0, a2
 ; ZVFH-NEXT:    addi a3, a3, -1
 ; ZVFH-NEXT:    and a2, a3, a2
-; ZVFH-NEXT:    addi a3, sp, 16
-; ZVFH-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvt.f.x.w v20, v24, v0.t
+; ZVFH-NEXT:    vfncvt.f.x.w v28, v16, v0.t
 ; ZVFH-NEXT:    bltu a0, a1, .LBB34_2
 ; ZVFH-NEXT:  # %bb.1:
 ; ZVFH-NEXT:    mv a0, a1
 ; ZVFH-NEXT:  .LBB34_2:
-; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmv1r.v v0, v24
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvt.f.x.w v16, v8, v0.t
-; ZVFH-NEXT:    vmv8r.v v8, v16
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 16
-; ZVFH-NEXT:    addi sp, sp, 16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 0
+; ZVFH-NEXT:    vfncvt.f.x.w v24, v8, v0.t
+; ZVFH-NEXT:    vmv8r.v v8, v24
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vsitofp_nxv32f16_nxv32i32:
@@ -551,9 +535,9 @@ define <vscale x 32 x half> @vsitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    srli a2, a1, 2
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    sltu a3, a0, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
@@ -585,9 +569,9 @@ define <vscale x 32 x float> @vsitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
index cff8cc710d21f3..208063bfd23426 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsra-vp.ll
@@ -11,11 +11,11 @@ define <vscale x 8 x i7> @vsra_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    li a0, 127
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vsra.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
 ; CHECK-NEXT:    vsra.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
index ff6771b643031f..c276599795908b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsrl-vp.ll
@@ -10,11 +10,10 @@ define <vscale x 8 x i7> @vsrl_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <v
 ; CHECK-LABEL: vsrl_vx_nxv8i7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vand.vx v9, v9, a2, v0.t
 ; CHECK-NEXT:    vsrl.vv v8, v8, v9, v0.t
 ; CHECK-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
index 0b8cbc46b6c979..f9c24eeec31c56 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -1395,9 +1395,9 @@ define <vscale x 32 x i32> @vssub_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
index 370be2e3260be3..04a1b522a8a33a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -11,8 +11,8 @@ define <vscale x 8 x i7> @vssubu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b,
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a2, 127
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vmv.v.x v9, a0
+; CHECK-NEXT:    vand.vx v8, v8, a2
 ; CHECK-NEXT:    vand.vx v9, v9, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
 ; CHECK-NEXT:    vssubu.vv v8, v8, v9, v0.t
@@ -1393,9 +1393,9 @@ define <vscale x 32 x i32> @vssubu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index b9697770994ef3..e62b7a00396388 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -160,12 +160,12 @@ define <vscale x 15 x i16> @vtrunc_nxv15i16_nxv15i64(<vscale x 15 x i64> %a, <vs
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
-; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    sub a3, a0, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    sub a2, a0, a1
-; CHECK-NEXT:    sltu a3, a0, a2
-; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    and a2, a3, a2
+; CHECK-NEXT:    sltu a2, a0, a3
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v28, v16, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
@@ -217,9 +217,9 @@ define <vscale x 32 x i7> @vtrunc_nxv32i7_nxv32i32(<vscale x 32 x i32> %a, <vsca
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -251,9 +251,9 @@ define <vscale x 32 x i8> @vtrunc_nxv32i8_nxv32i32(<vscale x 32 x i32> %a, <vsca
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1
@@ -298,35 +298,35 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    srli a4, a1, 2
-; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v0, a4
-; CHECK-NEXT:    slli a4, a1, 3
-; CHECK-NEXT:    add a4, a0, a4
-; CHECK-NEXT:    vl8re64.v v8, (a4)
+; CHECK-NEXT:    srli a5, a1, 2
+; CHECK-NEXT:    slli a6, a1, 3
 ; CHECK-NEXT:    slli a4, a1, 1
+; CHECK-NEXT:    vsetvli a7, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v16, v0, a5
+; CHECK-NEXT:    add a6, a0, a6
 ; CHECK-NEXT:    sub a5, a2, a4
+; CHECK-NEXT:    vl8re64.v v24, (a6)
 ; CHECK-NEXT:    sltu a6, a2, a5
 ; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    sub a6, a5, a1
 ; CHECK-NEXT:    sltu a7, a5, a6
 ; CHECK-NEXT:    addi a7, a7, -1
-; CHECK-NEXT:    vl8re64.v v24, (a0)
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v16, a3
 ; CHECK-NEXT:    and a0, a7, a6
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v20, v8, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v20, v24, 0, v0.t
 ; CHECK-NEXT:    bltu a5, a1, .LBB17_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a5, a1
 ; CHECK-NEXT:  .LBB17_2:
+; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v6, v7, a3
-; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v24, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    bltu a2, a4, .LBB17_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a2, a4

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
index dcb4904d40007d..2c5a01279d5d37 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vuitofp-vp.ll
@@ -500,42 +500,26 @@ declare <vscale x 32 x half> @llvm.vp.uitofp.nxv32f16.nxv32i32(<vscale x 32 x i3
 define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; ZVFH-LABEL: vuitofp_nxv32f16_nxv32i32:
 ; ZVFH:       # %bb.0:
-; ZVFH-NEXT:    addi sp, sp, -16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 16
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFH-NEXT:    vmv1r.v v7, v0
-; ZVFH-NEXT:    addi a1, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vmv1r.v v24, v0
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    srli a2, a1, 2
+; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; ZVFH-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFH-NEXT:    slli a1, a1, 1
 ; ZVFH-NEXT:    sub a2, a0, a1
 ; ZVFH-NEXT:    sltu a3, a0, a2
 ; ZVFH-NEXT:    addi a3, a3, -1
 ; ZVFH-NEXT:    and a2, a3, a2
-; ZVFH-NEXT:    addi a3, sp, 16
-; ZVFH-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvt.f.xu.w v20, v24, v0.t
+; ZVFH-NEXT:    vfncvt.f.xu.w v28, v16, v0.t
 ; ZVFH-NEXT:    bltu a0, a1, .LBB34_2
 ; ZVFH-NEXT:  # %bb.1:
 ; ZVFH-NEXT:    mv a0, a1
 ; ZVFH-NEXT:  .LBB34_2:
-; ZVFH-NEXT:    vmv1r.v v0, v7
+; ZVFH-NEXT:    vmv1r.v v0, v24
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFH-NEXT:    vfncvt.f.xu.w v16, v8, v0.t
-; ZVFH-NEXT:    vmv8r.v v8, v16
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add sp, sp, a0
-; ZVFH-NEXT:    .cfi_def_cfa sp, 16
-; ZVFH-NEXT:    addi sp, sp, 16
-; ZVFH-NEXT:    .cfi_def_cfa_offset 0
+; ZVFH-NEXT:    vfncvt.f.xu.w v24, v8, v0.t
+; ZVFH-NEXT:    vmv8r.v v8, v24
 ; ZVFH-NEXT:    ret
 ;
 ; ZVFHMIN-LABEL: vuitofp_nxv32f16_nxv32i32:
@@ -543,9 +527,9 @@ define <vscale x 32 x half> @vuitofp_nxv32f16_nxv32i32(<vscale x 32 x i32> %va,
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    srli a2, a1, 2
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
-; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    sltu a3, a0, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
@@ -577,9 +561,9 @@ define <vscale x 32 x float> @vuitofp_nxv32f32_nxv32i32(<vscale x 32 x i32> %va,
 ; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
index 336d86d57f3e62..ddc27f7562cdb1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
@@ -1377,9 +1377,9 @@ define <vscale x 1 x i64> @i1_zext(<vscale x 1 x i1> %va, <vscale x 1 x i64> %vb
 ;
 ; RV64-LABEL: i1_zext:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a1, zero, e64, m1, ta, mu
-; RV64-NEXT:    vadd.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    li a1, 42
+; RV64-NEXT:    vsetvli a2, zero, e64, m1, ta, mu
+; RV64-NEXT:    vadd.vi v8, v8, 1, v0.t
 ; RV64-NEXT:    sh a1, 0(a0)
 ; RV64-NEXT:    ret
   %vc = zext <vscale x 1 x i1> %va to <vscale x 1 x i64>

diff  --git a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
index 3b5541c1a2440c..10e655c8445409 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vzext-vp.ll
@@ -154,9 +154,9 @@ define <vscale x 32 x i32> @vzext_nxv32i8_nxv32i32(<vscale x 32 x i8> %a, <vscal
 ; CHECK-NEXT:    vmv1r.v v12, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    sltu a3, a0, a2
 ; CHECK-NEXT:    addi a3, a3, -1

diff  --git a/llvm/test/CodeGen/RISCV/sadd_sat.ll b/llvm/test/CodeGen/RISCV/sadd_sat.ll
index 3dbaaeaf02b6c2..ab03ccc4ba590a 100644
--- a/llvm/test/CodeGen/RISCV/sadd_sat.ll
+++ b/llvm/test/CodeGen/RISCV/sadd_sat.ll
@@ -58,12 +58,12 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:    add a5, a4, a3
 ; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    xor a3, a4, a3
 ; RV32I-NEXT:    sltu a1, a0, a1
 ; RV32I-NEXT:    add a1, a5, a1
-; RV32I-NEXT:    xor a2, a4, a1
-; RV32I-NEXT:    xor a3, a4, a3
-; RV32I-NEXT:    not a3, a3
-; RV32I-NEXT:    and a2, a3, a2
+; RV32I-NEXT:    xor a4, a4, a1
+; RV32I-NEXT:    not a2, a3
+; RV32I-NEXT:    and a2, a2, a4
 ; RV32I-NEXT:    bgez a2, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srai a0, a1, 31

diff  --git a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
index 648ec8b30d3897..abcf3379d0a6ed 100644
--- a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll
@@ -44,11 +44,11 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    sext.w a0, a0
 ; RV64IZbb-NEXT:    mulw a1, a1, a2
+; RV64IZbb-NEXT:    lui a2, 524288
 ; RV64IZbb-NEXT:    add a0, a0, a1
-; RV64IZbb-NEXT:    lui a1, 524288
-; RV64IZbb-NEXT:    addiw a2, a1, -1
-; RV64IZbb-NEXT:    min a0, a0, a2
-; RV64IZbb-NEXT:    max a0, a0, a1
+; RV64IZbb-NEXT:    addiw a1, a2, -1
+; RV64IZbb-NEXT:    min a0, a0, a1
+; RV64IZbb-NEXT:    max a0, a0, a2
 ; RV64IZbb-NEXT:    ret
   %a = mul i32 %y, %z
   %tmp = call i32 @llvm.sadd.sat.i32(i32 %x, i32 %a)
@@ -62,12 +62,12 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind {
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:    add a3, a2, a5
 ; RV32I-NEXT:    add a0, a0, a4
+; RV32I-NEXT:    xor a5, a2, a5
 ; RV32I-NEXT:    sltu a1, a0, a1
 ; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    xor a3, a2, a1
-; RV32I-NEXT:    xor a2, a2, a5
-; RV32I-NEXT:    not a2, a2
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    xor a2, a2, a1
+; RV32I-NEXT:    not a3, a5
+; RV32I-NEXT:    and a2, a3, a2
 ; RV32I-NEXT:    bgez a2, .LBB1_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    srai a0, a1, 31
@@ -118,13 +118,13 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV32I-LABEL: func16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    lui a2, 8
+; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lui a1, 8
-; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    addi a1, a2, -1
 ; RV32I-NEXT:    bge a0, a1, .LBB2_3
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lui a1, 1048568
@@ -142,13 +142,13 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV64I-LABEL: func16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    lui a2, 8
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    lui a1, 8
-; RV64I-NEXT:    addiw a1, a1, -1
+; RV64I-NEXT:    addiw a1, a2, -1
 ; RV64I-NEXT:    bge a0, a1, .LBB2_3
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lui a1, 1048568
@@ -167,11 +167,11 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    sext.h a0, a0
 ; RV32IZbb-NEXT:    mul a1, a1, a2
+; RV32IZbb-NEXT:    lui a2, 8
 ; RV32IZbb-NEXT:    sext.h a1, a1
+; RV32IZbb-NEXT:    addi a2, a2, -1
 ; RV32IZbb-NEXT:    add a0, a0, a1
-; RV32IZbb-NEXT:    lui a1, 8
-; RV32IZbb-NEXT:    addi a1, a1, -1
-; RV32IZbb-NEXT:    min a0, a0, a1
+; RV32IZbb-NEXT:    min a0, a0, a2
 ; RV32IZbb-NEXT:    lui a1, 1048568
 ; RV32IZbb-NEXT:    max a0, a0, a1
 ; RV32IZbb-NEXT:    ret
@@ -180,11 +180,11 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    sext.h a0, a0
 ; RV64IZbb-NEXT:    mul a1, a1, a2
+; RV64IZbb-NEXT:    lui a2, 8
 ; RV64IZbb-NEXT:    sext.h a1, a1
+; RV64IZbb-NEXT:    addiw a2, a2, -1
 ; RV64IZbb-NEXT:    add a0, a0, a1
-; RV64IZbb-NEXT:    lui a1, 8
-; RV64IZbb-NEXT:    addiw a1, a1, -1
-; RV64IZbb-NEXT:    min a0, a0, a1
+; RV64IZbb-NEXT:    min a0, a0, a2
 ; RV64IZbb-NEXT:    lui a1, 1048568
 ; RV64IZbb-NEXT:    max a0, a0, a1
 ; RV64IZbb-NEXT:    ret
@@ -197,8 +197,8 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV32I-LABEL: func8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    add a0, a0, a1
@@ -220,8 +220,8 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV64I-LABEL: func8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    add a0, a0, a1
@@ -244,10 +244,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    sext.b a0, a0
 ; RV32IZbb-NEXT:    mul a1, a1, a2
+; RV32IZbb-NEXT:    li a2, 127
 ; RV32IZbb-NEXT:    sext.b a1, a1
 ; RV32IZbb-NEXT:    add a0, a0, a1
-; RV32IZbb-NEXT:    li a1, 127
-; RV32IZbb-NEXT:    min a0, a0, a1
+; RV32IZbb-NEXT:    min a0, a0, a2
 ; RV32IZbb-NEXT:    li a1, -128
 ; RV32IZbb-NEXT:    max a0, a0, a1
 ; RV32IZbb-NEXT:    ret
@@ -256,10 +256,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    sext.b a0, a0
 ; RV64IZbb-NEXT:    mul a1, a1, a2
+; RV64IZbb-NEXT:    li a2, 127
 ; RV64IZbb-NEXT:    sext.b a1, a1
 ; RV64IZbb-NEXT:    add a0, a0, a1
-; RV64IZbb-NEXT:    li a1, 127
-; RV64IZbb-NEXT:    min a0, a0, a1
+; RV64IZbb-NEXT:    min a0, a0, a2
 ; RV64IZbb-NEXT:    li a1, -128
 ; RV64IZbb-NEXT:    max a0, a0, a1
 ; RV64IZbb-NEXT:    ret
@@ -272,8 +272,8 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV32I-LABEL: func4:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 28
-; RV32I-NEXT:    srai a0, a0, 28
 ; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    srai a0, a0, 28
 ; RV32I-NEXT:    slli a1, a1, 28
 ; RV32I-NEXT:    srai a1, a1, 28
 ; RV32I-NEXT:    add a0, a0, a1
@@ -295,8 +295,8 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV64I-LABEL: func4:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 60
-; RV64I-NEXT:    srai a0, a0, 60
 ; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srai a0, a0, 60
 ; RV64I-NEXT:    slli a1, a1, 60
 ; RV64I-NEXT:    srai a1, a1, 60
 ; RV64I-NEXT:    add a0, a0, a1
@@ -318,13 +318,13 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV32IZbb-LABEL: func4:
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    slli a0, a0, 28
-; RV32IZbb-NEXT:    srai a0, a0, 28
 ; RV32IZbb-NEXT:    mul a1, a1, a2
+; RV32IZbb-NEXT:    li a2, 7
+; RV32IZbb-NEXT:    srai a0, a0, 28
 ; RV32IZbb-NEXT:    slli a1, a1, 28
 ; RV32IZbb-NEXT:    srai a1, a1, 28
 ; RV32IZbb-NEXT:    add a0, a0, a1
-; RV32IZbb-NEXT:    li a1, 7
-; RV32IZbb-NEXT:    min a0, a0, a1
+; RV32IZbb-NEXT:    min a0, a0, a2
 ; RV32IZbb-NEXT:    li a1, -8
 ; RV32IZbb-NEXT:    max a0, a0, a1
 ; RV32IZbb-NEXT:    ret
@@ -332,13 +332,13 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV64IZbb-LABEL: func4:
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    slli a0, a0, 60
-; RV64IZbb-NEXT:    srai a0, a0, 60
 ; RV64IZbb-NEXT:    mul a1, a1, a2
+; RV64IZbb-NEXT:    li a2, 7
+; RV64IZbb-NEXT:    srai a0, a0, 60
 ; RV64IZbb-NEXT:    slli a1, a1, 60
 ; RV64IZbb-NEXT:    srai a1, a1, 60
 ; RV64IZbb-NEXT:    add a0, a0, a1
-; RV64IZbb-NEXT:    li a1, 7
-; RV64IZbb-NEXT:    min a0, a0, a1
+; RV64IZbb-NEXT:    min a0, a0, a2
 ; RV64IZbb-NEXT:    li a1, -8
 ; RV64IZbb-NEXT:    max a0, a0, a1
 ; RV64IZbb-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/select-binop-identity.ll b/llvm/test/CodeGen/RISCV/select-binop-identity.ll
index 83bb7f19fa2b05..325e4b54c1d6d8 100644
--- a/llvm/test/CodeGen/RISCV/select-binop-identity.ll
+++ b/llvm/test/CodeGen/RISCV/select-binop-identity.ll
@@ -301,8 +301,8 @@ define i64 @and_select_all_ones_i64_cmp2(i64 %x, i64 %y, i64 %z) {
 ; ZICOND32-LABEL: and_select_all_ones_i64_cmp2:
 ; ZICOND32:       # %bb.0:
 ; ZICOND32-NEXT:    slti a6, a5, 0
-; ZICOND32-NEXT:    czero.eqz a6, a6, a5
 ; ZICOND32-NEXT:    sltiu a4, a4, 4
+; ZICOND32-NEXT:    czero.eqz a6, a6, a5
 ; ZICOND32-NEXT:    czero.nez a4, a4, a5
 ; ZICOND32-NEXT:    or a4, a4, a6
 ; ZICOND32-NEXT:    addi a4, a4, -1

diff  --git a/llvm/test/CodeGen/RISCV/select-const.ll b/llvm/test/CodeGen/RISCV/select-const.ll
index 792df6236ddc0e..96081fc462d6f5 100644
--- a/llvm/test/CodeGen/RISCV/select-const.ll
+++ b/llvm/test/CodeGen/RISCV/select-const.ll
@@ -348,8 +348,8 @@ define i32 @select_eq_10000_10001(i32 signext %a, i32 signext %b) {
 ; RV32-LABEL: select_eq_10000_10001:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    lui a1, 2
+; RV32-NEXT:    seqz a0, a0
 ; RV32-NEXT:    addi a1, a1, 1810
 ; RV32-NEXT:    sub a0, a1, a0
 ; RV32-NEXT:    ret
@@ -357,8 +357,8 @@ define i32 @select_eq_10000_10001(i32 signext %a, i32 signext %b) {
 ; RV64-LABEL: select_eq_10000_10001:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    xor a0, a0, a1
-; RV64-NEXT:    seqz a0, a0
 ; RV64-NEXT:    lui a1, 2
+; RV64-NEXT:    seqz a0, a0
 ; RV64-NEXT:    addiw a1, a1, 1810
 ; RV64-NEXT:    sub a0, a1, a0
 ; RV64-NEXT:    ret
@@ -371,8 +371,8 @@ define i32 @select_ne_10001_10002(i32 signext %a, i32 signext %b) {
 ; RV32-LABEL: select_ne_10001_10002:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    snez a0, a0
 ; RV32-NEXT:    lui a1, 2
+; RV32-NEXT:    snez a0, a0
 ; RV32-NEXT:    addi a1, a1, 1810
 ; RV32-NEXT:    sub a0, a1, a0
 ; RV32-NEXT:    ret
@@ -380,8 +380,8 @@ define i32 @select_ne_10001_10002(i32 signext %a, i32 signext %b) {
 ; RV64-LABEL: select_ne_10001_10002:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    xor a0, a0, a1
-; RV64-NEXT:    snez a0, a0
 ; RV64-NEXT:    lui a1, 2
+; RV64-NEXT:    snez a0, a0
 ; RV64-NEXT:    addiw a1, a1, 1810
 ; RV64-NEXT:    sub a0, a1, a0
 ; RV64-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index 8aa50cc0f39c1a..252cf776299b30 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -1496,8 +1496,8 @@ define i32 @select_cst_not4(i32 signext %a, i32 signext %b) {
 ; RV64IM-LABEL: select_cst_not4:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slt a0, a0, a1
-; RV64IM-NEXT:    neg a0, a0
 ; RV64IM-NEXT:    lui a1, 524288
+; RV64IM-NEXT:    neg a0, a0
 ; RV64IM-NEXT:    addiw a1, a1, -1
 ; RV64IM-NEXT:    xor a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -1505,8 +1505,8 @@ define i32 @select_cst_not4(i32 signext %a, i32 signext %b) {
 ; RV64IMXVTCONDOPS-LABEL: select_cst_not4:
 ; RV64IMXVTCONDOPS:       # %bb.0:
 ; RV64IMXVTCONDOPS-NEXT:    slt a0, a0, a1
-; RV64IMXVTCONDOPS-NEXT:    neg a0, a0
 ; RV64IMXVTCONDOPS-NEXT:    lui a1, 524288
+; RV64IMXVTCONDOPS-NEXT:    neg a0, a0
 ; RV64IMXVTCONDOPS-NEXT:    addiw a1, a1, -1
 ; RV64IMXVTCONDOPS-NEXT:    xor a0, a0, a1
 ; RV64IMXVTCONDOPS-NEXT:    ret
@@ -1522,8 +1522,8 @@ define i32 @select_cst_not4(i32 signext %a, i32 signext %b) {
 ; RV64IMZICOND-LABEL: select_cst_not4:
 ; RV64IMZICOND:       # %bb.0:
 ; RV64IMZICOND-NEXT:    slt a0, a0, a1
-; RV64IMZICOND-NEXT:    neg a0, a0
 ; RV64IMZICOND-NEXT:    lui a1, 524288
+; RV64IMZICOND-NEXT:    neg a0, a0
 ; RV64IMZICOND-NEXT:    addiw a1, a1, -1
 ; RV64IMZICOND-NEXT:    xor a0, a0, a1
 ; RV64IMZICOND-NEXT:    ret
@@ -1536,8 +1536,8 @@ define i32 @select_cst_not5(i32 signext %a, i32 signext %b) {
 ; RV32IM-LABEL: select_cst_not5:
 ; RV32IM:       # %bb.0:
 ; RV32IM-NEXT:    slt a0, a0, a1
-; RV32IM-NEXT:    neg a0, a0
 ; RV32IM-NEXT:    lui a1, 16
+; RV32IM-NEXT:    neg a0, a0
 ; RV32IM-NEXT:    addi a1, a1, -5
 ; RV32IM-NEXT:    xor a0, a0, a1
 ; RV32IM-NEXT:    ret
@@ -1545,8 +1545,8 @@ define i32 @select_cst_not5(i32 signext %a, i32 signext %b) {
 ; RV64IM-LABEL: select_cst_not5:
 ; RV64IM:       # %bb.0:
 ; RV64IM-NEXT:    slt a0, a0, a1
-; RV64IM-NEXT:    neg a0, a0
 ; RV64IM-NEXT:    lui a1, 16
+; RV64IM-NEXT:    neg a0, a0
 ; RV64IM-NEXT:    addiw a1, a1, -5
 ; RV64IM-NEXT:    xor a0, a0, a1
 ; RV64IM-NEXT:    ret
@@ -1554,8 +1554,8 @@ define i32 @select_cst_not5(i32 signext %a, i32 signext %b) {
 ; RV64IMXVTCONDOPS-LABEL: select_cst_not5:
 ; RV64IMXVTCONDOPS:       # %bb.0:
 ; RV64IMXVTCONDOPS-NEXT:    slt a0, a0, a1
-; RV64IMXVTCONDOPS-NEXT:    neg a0, a0
 ; RV64IMXVTCONDOPS-NEXT:    lui a1, 16
+; RV64IMXVTCONDOPS-NEXT:    neg a0, a0
 ; RV64IMXVTCONDOPS-NEXT:    addiw a1, a1, -5
 ; RV64IMXVTCONDOPS-NEXT:    xor a0, a0, a1
 ; RV64IMXVTCONDOPS-NEXT:    ret
@@ -1563,8 +1563,8 @@ define i32 @select_cst_not5(i32 signext %a, i32 signext %b) {
 ; RV32IMZICOND-LABEL: select_cst_not5:
 ; RV32IMZICOND:       # %bb.0:
 ; RV32IMZICOND-NEXT:    slt a0, a0, a1
-; RV32IMZICOND-NEXT:    neg a0, a0
 ; RV32IMZICOND-NEXT:    lui a1, 16
+; RV32IMZICOND-NEXT:    neg a0, a0
 ; RV32IMZICOND-NEXT:    addi a1, a1, -5
 ; RV32IMZICOND-NEXT:    xor a0, a0, a1
 ; RV32IMZICOND-NEXT:    ret
@@ -1572,8 +1572,8 @@ define i32 @select_cst_not5(i32 signext %a, i32 signext %b) {
 ; RV64IMZICOND-LABEL: select_cst_not5:
 ; RV64IMZICOND:       # %bb.0:
 ; RV64IMZICOND-NEXT:    slt a0, a0, a1
-; RV64IMZICOND-NEXT:    neg a0, a0
 ; RV64IMZICOND-NEXT:    lui a1, 16
+; RV64IMZICOND-NEXT:    neg a0, a0
 ; RV64IMZICOND-NEXT:    addiw a1, a1, -5
 ; RV64IMZICOND-NEXT:    xor a0, a0, a1
 ; RV64IMZICOND-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/setcc-logic.ll b/llvm/test/CodeGen/RISCV/setcc-logic.ll
index 2b87026e98e6a8..fabb573e98d2af 100644
--- a/llvm/test/CodeGen/RISCV/setcc-logic.ll
+++ b/llvm/test/CodeGen/RISCV/setcc-logic.ll
@@ -96,8 +96,8 @@ define i1 @and_icmps_const_not1bit_diff(i32 %x) nounwind {
 ; RV32I-LABEL: and_icmps_const_not1bit_diff:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a1, a0, -44
-; RV32I-NEXT:    snez a1, a1
 ; RV32I-NEXT:    addi a0, a0, -92
+; RV32I-NEXT:    snez a1, a1
 ; RV32I-NEXT:    snez a0, a0
 ; RV32I-NEXT:    and a0, a1, a0
 ; RV32I-NEXT:    ret
@@ -106,8 +106,8 @@ define i1 @and_icmps_const_not1bit_diff(i32 %x) nounwind {
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    sext.w a0, a0
 ; RV64I-NEXT:    addi a1, a0, -44
-; RV64I-NEXT:    snez a1, a1
 ; RV64I-NEXT:    addi a0, a0, -92
+; RV64I-NEXT:    snez a1, a1
 ; RV64I-NEXT:    snez a0, a0
 ; RV64I-NEXT:    and a0, a1, a0
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index 8cf78551d28f98..11b0e5263e1122 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -177,13 +177,13 @@ define void @test5(i32 signext %arg, i32 signext %arg1) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sraw a0, a0, a1
 ; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
 ; RV64I-NEXT:    addiw s0, a1, 1365
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw s1, a1, 819
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addi s2, a1, -241
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addi s3, a1, 257
+; RV64I-NEXT:    addiw s1, a2, 819
+; RV64I-NEXT:    addi s2, a3, -241
+; RV64I-NEXT:    addi s3, a4, 257
 ; RV64I-NEXT:  .LBB4_1: # %bb2
 ; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64I-NEXT:    call bar
@@ -324,19 +324,19 @@ define void @test7(i32 signext %arg, i32 signext %arg1) nounwind {
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sraw a0, a0, a1
 ; RV64I-NEXT:    lui a1, 349525
+; RV64I-NEXT:    lui a2, 209715
+; RV64I-NEXT:    lui a3, 61681
+; RV64I-NEXT:    lui a4, 4112
 ; RV64I-NEXT:    addiw s0, a1, 1365
+; RV64I-NEXT:    addiw s1, a2, 819
+; RV64I-NEXT:    addiw s2, a3, -241
+; RV64I-NEXT:    addiw s3, a4, 257
 ; RV64I-NEXT:    slli a1, s0, 32
 ; RV64I-NEXT:    add s0, s0, a1
-; RV64I-NEXT:    lui a1, 209715
-; RV64I-NEXT:    addiw s1, a1, 819
 ; RV64I-NEXT:    slli a1, s1, 32
 ; RV64I-NEXT:    add s1, s1, a1
-; RV64I-NEXT:    lui a1, 61681
-; RV64I-NEXT:    addiw s2, a1, -241
 ; RV64I-NEXT:    slli a1, s2, 32
 ; RV64I-NEXT:    add s2, s2, a1
-; RV64I-NEXT:    lui a1, 4112
-; RV64I-NEXT:    addiw s3, a1, 257
 ; RV64I-NEXT:    slli a1, s3, 32
 ; RV64I-NEXT:    add s3, s3, a1
 ; RV64I-NEXT:  .LBB6_1: # %bb2
@@ -1037,36 +1037,36 @@ define signext i32 @bug(i32 signext %x) {
 ; CHECK-NEXT:    srliw a2, a0, 16
 ; CHECK-NEXT:    seqz a1, a2
 ; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sllw a1, a0, a1
-; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    sllw a0, a0, a1
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    beqz a2, .LBB18_3
 ; CHECK-NEXT:  # %bb.2: # %if.end
-; CHECK-NEXT:    li a0, 32
+; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:  .LBB18_3: # %if.end
-; CHECK-NEXT:    srliw a2, a1, 24
+; CHECK-NEXT:    srliw a2, a0, 24
 ; CHECK-NEXT:    seqz a2, a2
 ; CHECK-NEXT:    slli a3, a2, 3
-; CHECK-NEXT:    sllw a1, a1, a3
 ; CHECK-NEXT:    negw a2, a2
+; CHECK-NEXT:    sllw a0, a0, a3
 ; CHECK-NEXT:    andi a2, a2, -8
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    srliw a2, a1, 28
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    srliw a2, a0, 28
 ; CHECK-NEXT:    seqz a2, a2
 ; CHECK-NEXT:    slli a3, a2, 2
-; CHECK-NEXT:    sllw a1, a1, a3
 ; CHECK-NEXT:    negw a2, a2
+; CHECK-NEXT:    sllw a0, a0, a3
 ; CHECK-NEXT:    andi a2, a2, -4
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    srliw a2, a1, 30
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    srliw a2, a0, 30
 ; CHECK-NEXT:    seqz a2, a2
 ; CHECK-NEXT:    slli a3, a2, 1
-; CHECK-NEXT:    sllw a1, a1, a3
 ; CHECK-NEXT:    negw a2, a2
+; CHECK-NEXT:    sllw a0, a0, a3
 ; CHECK-NEXT:    andi a2, a2, -2
-; CHECK-NEXT:    add a0, a0, a2
-; CHECK-NEXT:    not a1, a1
-; CHECK-NEXT:    srli a1, a1, 31
-; CHECK-NEXT:    addw a0, a0, a1
+; CHECK-NEXT:    add a1, a1, a2
+; CHECK-NEXT:    not a0, a0
+; CHECK-NEXT:    srli a0, a0, 31
+; CHECK-NEXT:    addw a0, a1, a0
 ; CHECK-NEXT:  .LBB18_4: # %cleanup
 ; CHECK-NEXT:    ret
 ;
@@ -1077,36 +1077,36 @@ define signext i32 @bug(i32 signext %x) {
 ; NOREMOVAL-NEXT:    srliw a2, a0, 16
 ; NOREMOVAL-NEXT:    seqz a1, a2
 ; NOREMOVAL-NEXT:    slli a1, a1, 4
-; NOREMOVAL-NEXT:    sllw a1, a0, a1
-; NOREMOVAL-NEXT:    li a0, 16
+; NOREMOVAL-NEXT:    sllw a0, a0, a1
+; NOREMOVAL-NEXT:    li a1, 16
 ; NOREMOVAL-NEXT:    beqz a2, .LBB18_3
 ; NOREMOVAL-NEXT:  # %bb.2: # %if.end
-; NOREMOVAL-NEXT:    li a0, 32
+; NOREMOVAL-NEXT:    li a1, 32
 ; NOREMOVAL-NEXT:  .LBB18_3: # %if.end
-; NOREMOVAL-NEXT:    srliw a2, a1, 24
+; NOREMOVAL-NEXT:    srliw a2, a0, 24
 ; NOREMOVAL-NEXT:    seqz a2, a2
 ; NOREMOVAL-NEXT:    slli a3, a2, 3
-; NOREMOVAL-NEXT:    sllw a1, a1, a3
 ; NOREMOVAL-NEXT:    negw a2, a2
+; NOREMOVAL-NEXT:    sllw a0, a0, a3
 ; NOREMOVAL-NEXT:    andi a2, a2, -8
-; NOREMOVAL-NEXT:    add a0, a0, a2
-; NOREMOVAL-NEXT:    srliw a2, a1, 28
+; NOREMOVAL-NEXT:    add a1, a1, a2
+; NOREMOVAL-NEXT:    srliw a2, a0, 28
 ; NOREMOVAL-NEXT:    seqz a2, a2
 ; NOREMOVAL-NEXT:    slli a3, a2, 2
-; NOREMOVAL-NEXT:    sllw a1, a1, a3
 ; NOREMOVAL-NEXT:    negw a2, a2
+; NOREMOVAL-NEXT:    sllw a0, a0, a3
 ; NOREMOVAL-NEXT:    andi a2, a2, -4
-; NOREMOVAL-NEXT:    add a0, a0, a2
-; NOREMOVAL-NEXT:    srliw a2, a1, 30
+; NOREMOVAL-NEXT:    add a1, a1, a2
+; NOREMOVAL-NEXT:    srliw a2, a0, 30
 ; NOREMOVAL-NEXT:    seqz a2, a2
 ; NOREMOVAL-NEXT:    slli a3, a2, 1
-; NOREMOVAL-NEXT:    sllw a1, a1, a3
 ; NOREMOVAL-NEXT:    negw a2, a2
+; NOREMOVAL-NEXT:    sllw a0, a0, a3
 ; NOREMOVAL-NEXT:    andi a2, a2, -2
-; NOREMOVAL-NEXT:    add a0, a0, a2
-; NOREMOVAL-NEXT:    not a1, a1
-; NOREMOVAL-NEXT:    srli a1, a1, 31
-; NOREMOVAL-NEXT:    addw a0, a0, a1
+; NOREMOVAL-NEXT:    add a1, a1, a2
+; NOREMOVAL-NEXT:    not a0, a0
+; NOREMOVAL-NEXT:    srli a0, a0, 31
+; NOREMOVAL-NEXT:    addw a0, a1, a0
 ; NOREMOVAL-NEXT:  .LBB18_4: # %cleanup
 ; NOREMOVAL-NEXT:    ret
 entry:

diff  --git a/llvm/test/CodeGen/RISCV/shift-amount-mod.ll b/llvm/test/CodeGen/RISCV/shift-amount-mod.ll
index 66a8b85313e9bf..1e893d9baa4941 100644
--- a/llvm/test/CodeGen/RISCV/shift-amount-mod.ll
+++ b/llvm/test/CodeGen/RISCV/shift-amount-mod.ll
@@ -250,21 +250,21 @@ define i64 @ashr_by_masked_complemented_64(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    li a2, 63
 ; RV32I-NEXT:    sub a2, a2, a0
-; RV32I-NEXT:    andi a3, a2, 63
-; RV32I-NEXT:    addi a4, a3, -32
-; RV32I-NEXT:    mv a2, a1
-; RV32I-NEXT:    bltz a4, .LBB9_2
+; RV32I-NEXT:    andi a2, a2, 63
+; RV32I-NEXT:    addi a3, a2, -32
+; RV32I-NEXT:    bltz a3, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a1, a2, 31
-; RV32I-NEXT:    sra a0, a2, a3
+; RV32I-NEXT:    mv a0, a1
+; RV32I-NEXT:    srai a1, a1, 31
+; RV32I-NEXT:    sra a0, a0, a2
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    not a4, a0
-; RV32I-NEXT:    sra a1, a2, a4
-; RV32I-NEXT:    srl a0, a0, a4
-; RV32I-NEXT:    not a3, a3
-; RV32I-NEXT:    slli a2, a2, 1
-; RV32I-NEXT:    sll a2, a2, a3
+; RV32I-NEXT:    not a3, a0
+; RV32I-NEXT:    not a2, a2
+; RV32I-NEXT:    slli a4, a1, 1
+; RV32I-NEXT:    sra a1, a1, a3
+; RV32I-NEXT:    srl a0, a0, a3
+; RV32I-NEXT:    sll a2, a4, a2
 ; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/shift-and.ll b/llvm/test/CodeGen/RISCV/shift-and.ll
index 525ef624179c60..c9efeea980f5af 100644
--- a/llvm/test/CodeGen/RISCV/shift-and.ll
+++ b/llvm/test/CodeGen/RISCV/shift-and.ll
@@ -67,8 +67,8 @@ define i64 @test4(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a2, a1, 26
 ; RV32I-NEXT:    srli a0, a0, 6
-; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    srli a1, a1, 6
+; RV32I-NEXT:    or a0, a0, a2
 ; RV32I-NEXT:    lui a2, 1048572
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    ret
@@ -105,8 +105,8 @@ define i64 @test6(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 26
 ; RV32I-NEXT:    slli a1, a1, 6
-; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    srli a0, a0, 10
+; RV32I-NEXT:    or a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index 2bec1fca5c43b4..249dabba0cc283 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -161,37 +161,37 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    mv a6, sp
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a3, a2, 31
 ; RV32I-NEXT:    andi a1, a1, 12
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    add a1, a6, a1
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw a6, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    slli a6, a4, 1
-; RV32I-NEXT:    andi a7, a2, 31
-; RV32I-NEXT:    xori a7, a7, 31
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    or a3, a3, a6
 ; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    slli a6, a5, 1
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    slli a7, a5, 1
 ; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli a6, a1, 1
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    srl a1, a1, a2
+; RV32I-NEXT:    slli t0, a6, 1
+; RV32I-NEXT:    srl a6, a6, a2
+; RV32I-NEXT:    srl a2, a1, a2
+; RV32I-NEXT:    slli a1, a1, 1
+; RV32I-NEXT:    sll a7, a7, a3
+; RV32I-NEXT:    sll t0, t0, a3
+; RV32I-NEXT:    sll a1, a1, a3
+; RV32I-NEXT:    or a3, a4, a7
+; RV32I-NEXT:    or a4, a5, t0
+; RV32I-NEXT:    or a1, a6, a1
 ; RV32I-NEXT:    sw a3, 0(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
-; RV32I-NEXT:    sw a5, 8(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -226,42 +226,42 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    lw a4, 4(a1)
 ; RV32I-NEXT:    lw a5, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    mv a6, sp
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a1, 12(sp)
 ; RV32I-NEXT:    srai a1, a1, 31
+; RV32I-NEXT:    srli a3, a2, 3
+; RV32I-NEXT:    andi a4, a2, 31
 ; RV32I-NEXT:    sw a1, 16(sp)
 ; RV32I-NEXT:    sw a1, 20(sp)
 ; RV32I-NEXT:    sw a1, 24(sp)
 ; RV32I-NEXT:    sw a1, 28(sp)
-; RV32I-NEXT:    srli a1, a2, 3
-; RV32I-NEXT:    andi a1, a1, 12
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a1, a3, a1
-; RV32I-NEXT:    lw a3, 0(a1)
-; RV32I-NEXT:    lw a4, 4(a1)
-; RV32I-NEXT:    lw a5, 8(a1)
-; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    srl a3, a3, a2
-; RV32I-NEXT:    slli a6, a4, 1
-; RV32I-NEXT:    andi a7, a2, 31
-; RV32I-NEXT:    xori a7, a7, 31
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    andi a3, a3, 12
+; RV32I-NEXT:    xori a1, a4, 31
+; RV32I-NEXT:    add a3, a6, a3
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    lw a5, 4(a3)
+; RV32I-NEXT:    lw a6, 8(a3)
+; RV32I-NEXT:    lw a3, 12(a3)
 ; RV32I-NEXT:    srl a4, a4, a2
-; RV32I-NEXT:    slli a6, a5, 1
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    slli a7, a5, 1
 ; RV32I-NEXT:    srl a5, a5, a2
-; RV32I-NEXT:    slli a6, a1, 1
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    sra a1, a1, a2
+; RV32I-NEXT:    slli t0, a6, 1
+; RV32I-NEXT:    srl a6, a6, a2
+; RV32I-NEXT:    sra a2, a3, a2
+; RV32I-NEXT:    slli a3, a3, 1
+; RV32I-NEXT:    sll a7, a7, a1
+; RV32I-NEXT:    sll t0, t0, a1
+; RV32I-NEXT:    sll a1, a3, a1
+; RV32I-NEXT:    or a3, a4, a7
+; RV32I-NEXT:    or a4, a5, t0
+; RV32I-NEXT:    or a1, a6, a1
 ; RV32I-NEXT:    sw a3, 0(a0)
 ; RV32I-NEXT:    sw a4, 4(a0)
-; RV32I-NEXT:    sw a5, 8(a0)
-; RV32I-NEXT:    sw a1, 12(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
+; RV32I-NEXT:    sw a2, 12(a0)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -300,37 +300,37 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    addi a6, sp, 16
 ; RV32I-NEXT:    sw a3, 16(sp)
 ; RV32I-NEXT:    sw a4, 20(sp)
 ; RV32I-NEXT:    sw a5, 24(sp)
 ; RV32I-NEXT:    sw a1, 28(sp)
 ; RV32I-NEXT:    srli a1, a2, 3
+; RV32I-NEXT:    andi a3, a2, 31
 ; RV32I-NEXT:    andi a1, a1, 12
-; RV32I-NEXT:    addi a3, sp, 16
-; RV32I-NEXT:    sub a3, a3, a1
-; RV32I-NEXT:    lw a1, 4(a3)
-; RV32I-NEXT:    lw a4, 0(a3)
-; RV32I-NEXT:    lw a5, 8(a3)
-; RV32I-NEXT:    lw a3, 12(a3)
-; RV32I-NEXT:    sll a6, a1, a2
-; RV32I-NEXT:    srli a7, a4, 1
-; RV32I-NEXT:    andi t0, a2, 31
-; RV32I-NEXT:    xori t0, t0, 31
-; RV32I-NEXT:    srl a7, a7, t0
-; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    sub a1, a6, a1
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw a6, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    sll a7, a5, a2
-; RV32I-NEXT:    srli a1, a1, 1
-; RV32I-NEXT:    srl a1, a1, t0
-; RV32I-NEXT:    or a1, a7, a1
-; RV32I-NEXT:    sll a3, a3, a2
+; RV32I-NEXT:    srli t0, a4, 1
+; RV32I-NEXT:    sll a1, a1, a2
+; RV32I-NEXT:    sll a4, a4, a2
+; RV32I-NEXT:    sll a2, a6, a2
 ; RV32I-NEXT:    srli a5, a5, 1
-; RV32I-NEXT:    srl a5, a5, t0
-; RV32I-NEXT:    or a3, a3, a5
-; RV32I-NEXT:    sll a2, a4, a2
-; RV32I-NEXT:    sw a2, 0(a0)
+; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    srl t0, t0, a3
+; RV32I-NEXT:    srl a5, a5, a3
+; RV32I-NEXT:    srl a3, a6, a3
+; RV32I-NEXT:    or a6, a7, t0
+; RV32I-NEXT:    or a2, a2, a5
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw a4, 0(a0)
 ; RV32I-NEXT:    sw a6, 4(a0)
-; RV32I-NEXT:    sw a1, 8(a0)
-; RV32I-NEXT:    sw a3, 12(a0)
+; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -359,24 +359,24 @@ define i128 @shl128(i128 %a, i128 %b) nounwind {
 define i64 @fshr64_minsize(i64 %a, i64 %b) minsize nounwind {
 ; RV32I-LABEL: fshr64_minsize:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a4, a2, 32
+; RV32I-NEXT:    andi a5, a2, 32
 ; RV32I-NEXT:    mv a3, a0
-; RV32I-NEXT:    beqz a4, .LBB9_2
+; RV32I-NEXT:    beqz a5, .LBB9_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:  .LBB9_2:
-; RV32I-NEXT:    srl a5, a3, a2
-; RV32I-NEXT:    beqz a4, .LBB9_4
+; RV32I-NEXT:    srl a4, a3, a2
+; RV32I-NEXT:    beqz a5, .LBB9_4
 ; RV32I-NEXT:  # %bb.3:
 ; RV32I-NEXT:    mv a1, a0
 ; RV32I-NEXT:  .LBB9_4:
 ; RV32I-NEXT:    slli a0, a1, 1
-; RV32I-NEXT:    not a4, a2
-; RV32I-NEXT:    sll a0, a0, a4
-; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    not a5, a2
 ; RV32I-NEXT:    srl a1, a1, a2
 ; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    sll a2, a3, a4
+; RV32I-NEXT:    sll a0, a0, a5
+; RV32I-NEXT:    sll a2, a3, a5
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:    or a1, a2, a1
 ; RV32I-NEXT:    ret
 ;
@@ -395,90 +395,90 @@ define i128 @fshr128_minsize(i128 %a, i128 %b) minsize nounwind {
 ; RV32I-LABEL: fshr128_minsize:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lw a2, 0(a2)
-; RV32I-NEXT:    lw t2, 0(a1)
+; RV32I-NEXT:    lw t1, 0(a1)
 ; RV32I-NEXT:    lw a7, 4(a1)
-; RV32I-NEXT:    lw a3, 8(a1)
+; RV32I-NEXT:    lw a4, 8(a1)
 ; RV32I-NEXT:    lw a1, 12(a1)
-; RV32I-NEXT:    andi t1, a2, 64
+; RV32I-NEXT:    andi t2, a2, 64
 ; RV32I-NEXT:    mv t0, a7
-; RV32I-NEXT:    mv a4, t2
-; RV32I-NEXT:    beqz t1, .LBB10_2
+; RV32I-NEXT:    mv a3, t1
+; RV32I-NEXT:    beqz t2, .LBB10_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv t0, a1
-; RV32I-NEXT:    mv a4, a3
+; RV32I-NEXT:    mv a3, a4
 ; RV32I-NEXT:  .LBB10_2:
 ; RV32I-NEXT:    andi a6, a2, 32
-; RV32I-NEXT:    mv a5, a4
+; RV32I-NEXT:    mv a5, a3
 ; RV32I-NEXT:    bnez a6, .LBB10_13
 ; RV32I-NEXT:  # %bb.3:
-; RV32I-NEXT:    bnez t1, .LBB10_14
+; RV32I-NEXT:    bnez t2, .LBB10_14
 ; RV32I-NEXT:  .LBB10_4:
 ; RV32I-NEXT:    beqz a6, .LBB10_6
 ; RV32I-NEXT:  .LBB10_5:
-; RV32I-NEXT:    mv t0, a3
+; RV32I-NEXT:    mv t0, a4
 ; RV32I-NEXT:  .LBB10_6:
 ; RV32I-NEXT:    slli t3, t0, 1
-; RV32I-NEXT:    not t2, a2
-; RV32I-NEXT:    beqz t1, .LBB10_8
+; RV32I-NEXT:    not t1, a2
+; RV32I-NEXT:    beqz t2, .LBB10_8
 ; RV32I-NEXT:  # %bb.7:
 ; RV32I-NEXT:    mv a1, a7
 ; RV32I-NEXT:  .LBB10_8:
 ; RV32I-NEXT:    srl a7, a5, a2
-; RV32I-NEXT:    sll t1, t3, t2
+; RV32I-NEXT:    sll t2, t3, t1
 ; RV32I-NEXT:    srl t0, t0, a2
 ; RV32I-NEXT:    beqz a6, .LBB10_10
 ; RV32I-NEXT:  # %bb.9:
-; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a4, a1
 ; RV32I-NEXT:  .LBB10_10:
-; RV32I-NEXT:    or a7, t1, a7
-; RV32I-NEXT:    slli t1, a3, 1
-; RV32I-NEXT:    sll t1, t1, t2
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    srl a3, a3, a2
+; RV32I-NEXT:    or a7, t2, a7
+; RV32I-NEXT:    slli t2, a4, 1
+; RV32I-NEXT:    sll t2, t2, t1
+; RV32I-NEXT:    or t0, t2, t0
+; RV32I-NEXT:    srl a4, a4, a2
 ; RV32I-NEXT:    beqz a6, .LBB10_12
 ; RV32I-NEXT:  # %bb.11:
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a1, a3
 ; RV32I-NEXT:  .LBB10_12:
-; RV32I-NEXT:    slli a4, a1, 1
-; RV32I-NEXT:    sll a4, a4, t2
-; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    slli a3, a1, 1
 ; RV32I-NEXT:    srl a1, a1, a2
 ; RV32I-NEXT:    slli a5, a5, 1
-; RV32I-NEXT:    sll a2, a5, t2
-; RV32I-NEXT:    or a1, a2, a1
+; RV32I-NEXT:    sll a2, a3, t1
+; RV32I-NEXT:    sll a3, a5, t1
+; RV32I-NEXT:    or a2, a2, a4
+; RV32I-NEXT:    or a1, a3, a1
 ; RV32I-NEXT:    sw a7, 0(a0)
 ; RV32I-NEXT:    sw t0, 4(a0)
-; RV32I-NEXT:    sw a3, 8(a0)
+; RV32I-NEXT:    sw a2, 8(a0)
 ; RV32I-NEXT:    sw a1, 12(a0)
 ; RV32I-NEXT:    ret
 ; RV32I-NEXT:  .LBB10_13:
 ; RV32I-NEXT:    mv a5, t0
-; RV32I-NEXT:    beqz t1, .LBB10_4
+; RV32I-NEXT:    beqz t2, .LBB10_4
 ; RV32I-NEXT:  .LBB10_14:
-; RV32I-NEXT:    mv a3, t2
+; RV32I-NEXT:    mv a4, t1
 ; RV32I-NEXT:    bnez a6, .LBB10_5
 ; RV32I-NEXT:    j .LBB10_6
 ;
 ; RV64I-LABEL: fshr128_minsize:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    andi a4, a2, 64
+; RV64I-NEXT:    andi a5, a2, 64
 ; RV64I-NEXT:    mv a3, a0
-; RV64I-NEXT:    beqz a4, .LBB10_2
+; RV64I-NEXT:    beqz a5, .LBB10_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a3, a1
 ; RV64I-NEXT:  .LBB10_2:
-; RV64I-NEXT:    srl a5, a3, a2
-; RV64I-NEXT:    beqz a4, .LBB10_4
+; RV64I-NEXT:    srl a4, a3, a2
+; RV64I-NEXT:    beqz a5, .LBB10_4
 ; RV64I-NEXT:  # %bb.3:
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:  .LBB10_4:
 ; RV64I-NEXT:    slli a0, a1, 1
-; RV64I-NEXT:    not a4, a2
-; RV64I-NEXT:    sll a0, a0, a4
-; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    not a5, a2
 ; RV64I-NEXT:    srl a1, a1, a2
 ; RV64I-NEXT:    slli a3, a3, 1
-; RV64I-NEXT:    sll a2, a3, a4
+; RV64I-NEXT:    sll a0, a0, a5
+; RV64I-NEXT:    sll a2, a3, a5
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    or a1, a2, a1
 ; RV64I-NEXT:    ret
   %res = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 %b)

diff  --git a/llvm/test/CodeGen/RISCV/shl-cttz.ll b/llvm/test/CodeGen/RISCV/shl-cttz.ll
index 64be997c191be2..500673cc29196f 100644
--- a/llvm/test/CodeGen/RISCV/shl-cttz.ll
+++ b/llvm/test/CodeGen/RISCV/shl-cttz.ll
@@ -125,18 +125,18 @@ define i16 @shl_cttz_i16(i16 %x, i16 %y) {
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi a2, a1, -1
 ; RV32I-NEXT:    not a1, a1
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    srli a2, a1, 1
 ; RV32I-NEXT:    lui a3, 5
-; RV32I-NEXT:    addi a3, a3, 1365
-; RV32I-NEXT:    and a2, a2, a3
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    addi a2, a3, 1365
+; RV32I-NEXT:    srli a3, a1, 1
+; RV32I-NEXT:    and a2, a3, a2
+; RV32I-NEXT:    lui a3, 3
+; RV32I-NEXT:    addi a3, a3, 819
 ; RV32I-NEXT:    sub a1, a1, a2
-; RV32I-NEXT:    lui a2, 3
-; RV32I-NEXT:    addi a2, a2, 819
-; RV32I-NEXT:    and a3, a1, a2
+; RV32I-NEXT:    and a2, a1, a3
 ; RV32I-NEXT:    srli a1, a1, 2
-; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    add a1, a3, a1
+; RV32I-NEXT:    and a1, a1, a3
+; RV32I-NEXT:    add a1, a2, a1
 ; RV32I-NEXT:    srli a2, a1, 4
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    andi a2, a1, 15
@@ -156,18 +156,18 @@ define i16 @shl_cttz_i16(i16 %x, i16 %y) {
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi a2, a1, -1
 ; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    srli a2, a1, 1
 ; RV64I-NEXT:    lui a3, 5
-; RV64I-NEXT:    addiw a3, a3, 1365
-; RV64I-NEXT:    and a2, a2, a3
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    addiw a2, a3, 1365
+; RV64I-NEXT:    srli a3, a1, 1
+; RV64I-NEXT:    and a2, a3, a2
+; RV64I-NEXT:    lui a3, 3
+; RV64I-NEXT:    addiw a3, a3, 819
 ; RV64I-NEXT:    sub a1, a1, a2
-; RV64I-NEXT:    lui a2, 3
-; RV64I-NEXT:    addiw a2, a2, 819
-; RV64I-NEXT:    and a3, a1, a2
+; RV64I-NEXT:    and a2, a1, a3
 ; RV64I-NEXT:    srli a1, a1, 2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    add a1, a3, a1
+; RV64I-NEXT:    and a1, a1, a3
+; RV64I-NEXT:    add a1, a2, a1
 ; RV64I-NEXT:    srli a2, a1, 4
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    andi a2, a1, 15
@@ -193,18 +193,18 @@ define i16 @shl_cttz_constant_i16(i16 %y) {
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    addi a1, a0, -1
 ; RV32I-NEXT:    not a0, a0
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 1
 ; RV32I-NEXT:    lui a2, 5
-; RV32I-NEXT:    addi a2, a2, 1365
-; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 3
-; RV32I-NEXT:    addi a1, a1, 819
-; RV32I-NEXT:    and a2, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
 ; RV32I-NEXT:    srli a0, a0, 2
-; RV32I-NEXT:    and a0, a0, a1
-; RV32I-NEXT:    add a0, a2, a0
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    srli a1, a0, 4
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    andi a1, a0, 15
@@ -226,18 +226,18 @@ define i16 @shl_cttz_constant_i16(i16 %y) {
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    addi a1, a0, -1
 ; RV64I-NEXT:    not a0, a0
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 5
-; RV64I-NEXT:    addiw a2, a2, 1365
-; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    addiw a1, a2, 1365
+; RV64I-NEXT:    srli a2, a0, 1
+; RV64I-NEXT:    and a1, a2, a1
+; RV64I-NEXT:    lui a2, 3
+; RV64I-NEXT:    addiw a2, a2, 819
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 3
-; RV64I-NEXT:    addiw a1, a1, 819
-; RV64I-NEXT:    and a2, a0, a1
+; RV64I-NEXT:    and a1, a0, a2
 ; RV64I-NEXT:    srli a0, a0, 2
-; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srli a1, a0, 4
 ; RV64I-NEXT:    add a0, a0, a1
 ; RV64I-NEXT:    andi a1, a0, 15

diff  --git a/llvm/test/CodeGen/RISCV/shlimm-addimm.ll b/llvm/test/CodeGen/RISCV/shlimm-addimm.ll
index ead71bcbe113c9..c842ba5da52082 100644
--- a/llvm/test/CodeGen/RISCV/shlimm-addimm.ll
+++ b/llvm/test/CodeGen/RISCV/shlimm-addimm.ll
@@ -48,10 +48,10 @@ define i64 @shl5_add1184_c(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 27
 ; RV32I-NEXT:    slli a1, a1, 5
+; RV32I-NEXT:    slli a3, a0, 5
 ; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 5
-; RV32I-NEXT:    addi a0, a2, 1184
-; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    addi a0, a3, 1184
+; RV32I-NEXT:    sltu a2, a0, a3
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
@@ -112,12 +112,12 @@ define i64 @shl5_add101024_c(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 27
 ; RV32I-NEXT:    slli a1, a1, 5
+; RV32I-NEXT:    slli a3, a0, 5
 ; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 5
 ; RV32I-NEXT:    lui a0, 25
 ; RV32I-NEXT:    addi a0, a0, -1376
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    sltu a2, a0, a3
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
@@ -180,12 +180,12 @@ define i64 @shl5_add47968_c(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 27
 ; RV32I-NEXT:    slli a1, a1, 5
+; RV32I-NEXT:    slli a3, a0, 5
 ; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 5
 ; RV32I-NEXT:    lui a0, 12
 ; RV32I-NEXT:    addi a0, a0, -1184
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    sltu a2, a0, a3
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
@@ -248,12 +248,12 @@ define i64 @shl5_add47969_c(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 27
 ; RV32I-NEXT:    slli a1, a1, 5
+; RV32I-NEXT:    slli a3, a0, 5
 ; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 5
 ; RV32I-NEXT:    lui a0, 12
 ; RV32I-NEXT:    addi a0, a0, -1183
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    sltu a2, a0, a3
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    ret
 ;
@@ -316,12 +316,12 @@ define i64 @shl5_sub47968_c(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 27
 ; RV32I-NEXT:    slli a1, a1, 5
+; RV32I-NEXT:    slli a3, a0, 5
 ; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 5
 ; RV32I-NEXT:    lui a0, 1048564
 ; RV32I-NEXT:    addi a0, a0, 1184
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    sltu a2, a0, a3
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret
@@ -385,12 +385,12 @@ define i64 @shl5_sub47969_c(i64 %x) {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    srli a2, a0, 27
 ; RV32I-NEXT:    slli a1, a1, 5
+; RV32I-NEXT:    slli a3, a0, 5
 ; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a2, a0, 5
 ; RV32I-NEXT:    lui a0, 1048564
 ; RV32I-NEXT:    addi a0, a0, 1183
-; RV32I-NEXT:    add a0, a2, a0
-; RV32I-NEXT:    sltu a2, a0, a2
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    sltu a2, a0, a3
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    addi a1, a1, -1
 ; RV32I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
index c0c11fefafb555..b7b88584f3bdb8 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
@@ -69,39 +69,39 @@ define signext i32 @test3(i32 signext %v, i32 signext %w, i32 signext %x, i32 si
 ;
 ; RV64SFB-LABEL: test3:
 ; RV64SFB:       # %bb.0:
-; RV64SFB-NEXT:    bnez a4, .LBB2_2
+; RV64SFB-NEXT:    beqz a4, .LBB2_2
 ; RV64SFB-NEXT:  # %bb.1:
-; RV64SFB-NEXT:    mv a0, a1
+; RV64SFB-NEXT:    mv a2, a3
 ; RV64SFB-NEXT:  .LBB2_2:
-; RV64SFB-NEXT:    beqz a4, .LBB2_4
+; RV64SFB-NEXT:    bnez a4, .LBB2_4
 ; RV64SFB-NEXT:  # %bb.3:
-; RV64SFB-NEXT:    mv a2, a3
+; RV64SFB-NEXT:    mv a0, a1
 ; RV64SFB-NEXT:  .LBB2_4:
 ; RV64SFB-NEXT:    addw a0, a0, a2
 ; RV64SFB-NEXT:    ret
 ;
 ; ZICOND-LABEL: test3:
 ; ZICOND:       # %bb.0:
-; ZICOND-NEXT:    bnez a4, .LBB2_2
+; ZICOND-NEXT:    beqz a4, .LBB2_2
 ; ZICOND-NEXT:  # %bb.1:
-; ZICOND-NEXT:    mv a0, a1
+; ZICOND-NEXT:    mv a2, a3
 ; ZICOND-NEXT:  .LBB2_2:
-; ZICOND-NEXT:    beqz a4, .LBB2_4
+; ZICOND-NEXT:    bnez a4, .LBB2_4
 ; ZICOND-NEXT:  # %bb.3:
-; ZICOND-NEXT:    mv a2, a3
+; ZICOND-NEXT:    mv a0, a1
 ; ZICOND-NEXT:  .LBB2_4:
 ; ZICOND-NEXT:    addw a0, a0, a2
 ; ZICOND-NEXT:    ret
 ;
 ; RV32SFB-LABEL: test3:
 ; RV32SFB:       # %bb.0:
-; RV32SFB-NEXT:    bnez a4, .LBB2_2
+; RV32SFB-NEXT:    beqz a4, .LBB2_2
 ; RV32SFB-NEXT:  # %bb.1:
-; RV32SFB-NEXT:    mv a0, a1
+; RV32SFB-NEXT:    mv a2, a3
 ; RV32SFB-NEXT:  .LBB2_2:
-; RV32SFB-NEXT:    beqz a4, .LBB2_4
+; RV32SFB-NEXT:    bnez a4, .LBB2_4
 ; RV32SFB-NEXT:  # %bb.3:
-; RV32SFB-NEXT:    mv a2, a3
+; RV32SFB-NEXT:    mv a0, a1
 ; RV32SFB-NEXT:  .LBB2_4:
 ; RV32SFB-NEXT:    add a0, a0, a2
 ; RV32SFB-NEXT:    ret
@@ -566,18 +566,18 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3
 ; RV64SFB-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; RV64SFB-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; RV64SFB-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; RV64SFB-NEXT:    mv s0, a1
+; RV64SFB-NEXT:    mv s1, a1
 ; RV64SFB-NEXT:    andi a0, a0, 1
-; RV64SFB-NEXT:    mv s1, a2
+; RV64SFB-NEXT:    mv s0, a2
 ; RV64SFB-NEXT:    beqz a0, .LBB16_4
 ; RV64SFB-NEXT:  # %bb.3: # %bb
-; RV64SFB-NEXT:    addw s0, a1, a3
+; RV64SFB-NEXT:    addw s1, a1, a3
 ; RV64SFB-NEXT:  .LBB16_4: # %bb
 ; RV64SFB-NEXT:  .LBB16_1: # %bb2
 ; RV64SFB-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV64SFB-NEXT:    mv a0, s0
+; RV64SFB-NEXT:    mv a0, s1
 ; RV64SFB-NEXT:    call bar
-; RV64SFB-NEXT:    sllw s0, s0, s1
+; RV64SFB-NEXT:    sllw s1, s1, s0
 ; RV64SFB-NEXT:    bnez a0, .LBB16_1
 ; RV64SFB-NEXT:  # %bb.2: # %bb7
 ; RV64SFB-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -592,18 +592,18 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3
 ; ZICOND-NEXT:    sd ra, 24(sp) # 8-byte Folded Spill
 ; ZICOND-NEXT:    sd s0, 16(sp) # 8-byte Folded Spill
 ; ZICOND-NEXT:    sd s1, 8(sp) # 8-byte Folded Spill
-; ZICOND-NEXT:    mv s0, a1
+; ZICOND-NEXT:    mv s1, a1
 ; ZICOND-NEXT:    andi a0, a0, 1
-; ZICOND-NEXT:    mv s1, a2
+; ZICOND-NEXT:    mv s0, a2
 ; ZICOND-NEXT:    beqz a0, .LBB16_4
 ; ZICOND-NEXT:  # %bb.3: # %bb
-; ZICOND-NEXT:    addw s0, a1, a3
+; ZICOND-NEXT:    addw s1, a1, a3
 ; ZICOND-NEXT:  .LBB16_4: # %bb
 ; ZICOND-NEXT:  .LBB16_1: # %bb2
 ; ZICOND-NEXT:    # =>This Inner Loop Header: Depth=1
-; ZICOND-NEXT:    mv a0, s0
+; ZICOND-NEXT:    mv a0, s1
 ; ZICOND-NEXT:    call bar
-; ZICOND-NEXT:    sllw s0, s0, s1
+; ZICOND-NEXT:    sllw s1, s1, s0
 ; ZICOND-NEXT:    bnez a0, .LBB16_1
 ; ZICOND-NEXT:  # %bb.2: # %bb7
 ; ZICOND-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
@@ -618,18 +618,18 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3
 ; RV32SFB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32SFB-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
 ; RV32SFB-NEXT:    sw s1, 4(sp) # 4-byte Folded Spill
-; RV32SFB-NEXT:    mv s0, a1
+; RV32SFB-NEXT:    mv s1, a1
 ; RV32SFB-NEXT:    andi a0, a0, 1
-; RV32SFB-NEXT:    mv s1, a2
+; RV32SFB-NEXT:    mv s0, a2
 ; RV32SFB-NEXT:    beqz a0, .LBB16_4
 ; RV32SFB-NEXT:  # %bb.3: # %bb
-; RV32SFB-NEXT:    add s0, a1, a3
+; RV32SFB-NEXT:    add s1, a1, a3
 ; RV32SFB-NEXT:  .LBB16_4: # %bb
 ; RV32SFB-NEXT:  .LBB16_1: # %bb2
 ; RV32SFB-NEXT:    # =>This Inner Loop Header: Depth=1
-; RV32SFB-NEXT:    mv a0, s0
+; RV32SFB-NEXT:    mv a0, s1
 ; RV32SFB-NEXT:    call bar
-; RV32SFB-NEXT:    sll s0, s0, s1
+; RV32SFB-NEXT:    sll s1, s1, s0
 ; RV32SFB-NEXT:    bnez a0, .LBB16_1
 ; RV32SFB-NEXT:  # %bb.2: # %bb7
 ; RV32SFB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -1223,14 +1223,14 @@ define i64 @select_slli(i64 %A, i64 %C, i1 zeroext %cond) {
 ; RV32SFB-LABEL: select_slli:
 ; RV32SFB:       # %bb.0: # %entry
 ; RV32SFB-NEXT:    mv a1, a0
-; RV32SFB-NEXT:    mv a0, a2
-; RV32SFB-NEXT:    beqz a4, .LBB28_2
+; RV32SFB-NEXT:    bnez a4, .LBB28_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    mv a1, a3
+; RV32SFB-NEXT:    li a2, 0
 ; RV32SFB-NEXT:  .LBB28_2: # %entry
-; RV32SFB-NEXT:    bnez a4, .LBB28_4
+; RV32SFB-NEXT:    mv a0, a2
+; RV32SFB-NEXT:    beqz a4, .LBB28_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    li a0, 0
+; RV32SFB-NEXT:    mv a1, a3
 ; RV32SFB-NEXT:  .LBB28_4: # %entry
 ; RV32SFB-NEXT:    ret
 entry:
@@ -1567,11 +1567,11 @@ define i64 @select_andn(i64 %A, i64 %B, i64 %C, i1 zeroext %cond) {
 ; RV32SFB:       # %bb.0: # %entry
 ; RV32SFB-NEXT:    bnez a6, .LBB36_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    andn a4, a0, a2
+; RV32SFB-NEXT:    andn a5, a1, a3
 ; RV32SFB-NEXT:  .LBB36_2: # %entry
 ; RV32SFB-NEXT:    bnez a6, .LBB36_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    andn a5, a1, a3
+; RV32SFB-NEXT:    andn a4, a0, a2
 ; RV32SFB-NEXT:  .LBB36_4: # %entry
 ; RV32SFB-NEXT:    mv a0, a4
 ; RV32SFB-NEXT:    mv a1, a5
@@ -1615,11 +1615,11 @@ define i64 @select_orn(i64 %A, i64 %B, i64 %C, i1 zeroext %cond) {
 ; RV32SFB:       # %bb.0: # %entry
 ; RV32SFB-NEXT:    bnez a6, .LBB37_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    orn a4, a0, a2
+; RV32SFB-NEXT:    orn a5, a1, a3
 ; RV32SFB-NEXT:  .LBB37_2: # %entry
 ; RV32SFB-NEXT:    bnez a6, .LBB37_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    orn a5, a1, a3
+; RV32SFB-NEXT:    orn a4, a0, a2
 ; RV32SFB-NEXT:  .LBB37_4: # %entry
 ; RV32SFB-NEXT:    mv a0, a4
 ; RV32SFB-NEXT:    mv a1, a5
@@ -1663,11 +1663,11 @@ define i64 @select_xnor(i64 %A, i64 %B, i64 %C, i1 zeroext %cond) {
 ; RV32SFB:       # %bb.0: # %entry
 ; RV32SFB-NEXT:    bnez a6, .LBB38_2
 ; RV32SFB-NEXT:  # %bb.1: # %entry
-; RV32SFB-NEXT:    xnor a4, a0, a2
+; RV32SFB-NEXT:    xnor a5, a1, a3
 ; RV32SFB-NEXT:  .LBB38_2: # %entry
 ; RV32SFB-NEXT:    bnez a6, .LBB38_4
 ; RV32SFB-NEXT:  # %bb.3: # %entry
-; RV32SFB-NEXT:    xnor a5, a1, a3
+; RV32SFB-NEXT:    xnor a4, a0, a2
 ; RV32SFB-NEXT:  .LBB38_4: # %entry
 ; RV32SFB-NEXT:    mv a0, a4
 ; RV32SFB-NEXT:    mv a1, a5

diff  --git a/llvm/test/CodeGen/RISCV/signed-truncation-check.ll b/llvm/test/CodeGen/RISCV/signed-truncation-check.ll
index 54b85fab757ca1..d43dfd46d62fce 100644
--- a/llvm/test/CodeGen/RISCV/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/RISCV/signed-truncation-check.ll
@@ -25,8 +25,8 @@ define i1 @shifts_eqcmp_i16_i8(i16 %x) nounwind {
 ; RV32I-LABEL: shifts_eqcmp_i16_i8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a1, a0, 16
-; RV32I-NEXT:    srli a1, a1, 16
 ; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    srli a1, a1, 16
 ; RV32I-NEXT:    srai a0, a0, 8
 ; RV32I-NEXT:    srli a0, a0, 16
 ; RV32I-NEXT:    xor a0, a0, a1
@@ -36,8 +36,8 @@ define i1 @shifts_eqcmp_i16_i8(i16 %x) nounwind {
 ; RV64I-LABEL: shifts_eqcmp_i16_i8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a1, a0, 48
-; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    slli a0, a0, 56
+; RV64I-NEXT:    srli a1, a1, 48
 ; RV64I-NEXT:    srai a0, a0, 8
 ; RV64I-NEXT:    srli a0, a0, 48
 ; RV64I-NEXT:    xor a0, a0, a1
@@ -459,10 +459,10 @@ define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi a2, a0, -128
 ; RV32I-NEXT:    sltu a0, a2, a0
+; RV32I-NEXT:    sltiu a2, a2, -256
 ; RV32I-NEXT:    add a0, a1, a0
 ; RV32I-NEXT:    seqz a0, a0
-; RV32I-NEXT:    sltiu a1, a2, -256
-; RV32I-NEXT:    xori a1, a1, 1
+; RV32I-NEXT:    xori a1, a2, 1
 ; RV32I-NEXT:    and a0, a0, a1
 ; RV32I-NEXT:    ret
 ;
@@ -794,9 +794,9 @@ define i1 @add_ultcmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind {
 ; RV32I-LABEL: add_ultcmp_bad_i16_i8_cmp:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 16
+; RV32I-NEXT:    addi a0, a0, 128
 ; RV32I-NEXT:    addi a2, a2, -1
 ; RV32I-NEXT:    and a1, a1, a2
-; RV32I-NEXT:    addi a0, a0, 128
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    sltu a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -804,9 +804,9 @@ define i1 @add_ultcmp_bad_i16_i8_cmp(i16 %x, i16 %y) nounwind {
 ; RV64I-LABEL: add_ultcmp_bad_i16_i8_cmp:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a2, 16
+; RV64I-NEXT:    addi a0, a0, 128
 ; RV64I-NEXT:    addiw a2, a2, -1
 ; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    addi a0, a0, 128
 ; RV64I-NEXT:    and a0, a0, a2
 ; RV64I-NEXT:    sltu a0, a0, a1
 ; RV64I-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll
index 284b559eeec5f5..8f5b044c3b3b83 100644
--- a/llvm/test/CodeGen/RISCV/split-offsets.ll
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll
@@ -11,32 +11,32 @@
 define void @test1(ptr %sp, ptr %t, i32 %n) {
 ; RV32I-LABEL: test1:
 ; RV32I:       # %bb.0: # %entry
-; RV32I-NEXT:    lw a0, 0(a0)
 ; RV32I-NEXT:    lui a2, 20
+; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    li a3, 2
 ; RV32I-NEXT:    addi a2, a2, -1920
 ; RV32I-NEXT:    add a1, a1, a2
 ; RV32I-NEXT:    add a0, a0, a2
-; RV32I-NEXT:    li a2, 2
-; RV32I-NEXT:    li a3, 1
-; RV32I-NEXT:    sw a2, 0(a0)
-; RV32I-NEXT:    sw a3, 4(a0)
-; RV32I-NEXT:    sw a3, 0(a1)
-; RV32I-NEXT:    sw a2, 4(a1)
+; RV32I-NEXT:    li a2, 1
+; RV32I-NEXT:    sw a3, 0(a0)
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a2, 0(a1)
+; RV32I-NEXT:    sw a3, 4(a1)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test1:
 ; RV64I:       # %bb.0: # %entry
-; RV64I-NEXT:    ld a0, 0(a0)
 ; RV64I-NEXT:    lui a2, 20
+; RV64I-NEXT:    ld a0, 0(a0)
+; RV64I-NEXT:    li a3, 2
 ; RV64I-NEXT:    addiw a2, a2, -1920
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    add a0, a0, a2
-; RV64I-NEXT:    li a2, 2
-; RV64I-NEXT:    li a3, 1
-; RV64I-NEXT:    sw a2, 0(a0)
-; RV64I-NEXT:    sw a3, 4(a0)
-; RV64I-NEXT:    sw a3, 0(a1)
-; RV64I-NEXT:    sw a2, 4(a1)
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    sw a3, 0(a0)
+; RV64I-NEXT:    sw a2, 4(a0)
+; RV64I-NEXT:    sw a2, 0(a1)
+; RV64I-NEXT:    sw a3, 4(a1)
 ; RV64I-NEXT:    ret
 entry:
   %s = load ptr, ptr %sp
@@ -125,23 +125,23 @@ define void @test3(ptr %t) {
 ; RV32I-LABEL: test3:
 ; RV32I:       # %bb.0: # %entry
 ; RV32I-NEXT:    lui a1, 20
+; RV32I-NEXT:    li a2, 2
 ; RV32I-NEXT:    addi a1, a1, -1920
 ; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    li a1, 2
-; RV32I-NEXT:    li a2, 3
-; RV32I-NEXT:    sw a1, 4(a0)
-; RV32I-NEXT:    sw a2, 8(a0)
+; RV32I-NEXT:    li a1, 3
+; RV32I-NEXT:    sw a2, 4(a0)
+; RV32I-NEXT:    sw a1, 8(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: test3:
 ; RV64I:       # %bb.0: # %entry
 ; RV64I-NEXT:    lui a1, 20
+; RV64I-NEXT:    li a2, 2
 ; RV64I-NEXT:    addiw a1, a1, -1920
 ; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    li a1, 2
-; RV64I-NEXT:    li a2, 3
-; RV64I-NEXT:    sw a1, 4(a0)
-; RV64I-NEXT:    sw a2, 8(a0)
+; RV64I-NEXT:    li a1, 3
+; RV64I-NEXT:    sw a2, 4(a0)
+; RV64I-NEXT:    sw a1, 8(a0)
 ; RV64I-NEXT:    ret
 entry:
   %splitgep = getelementptr i8, ptr %t, i64 80000

diff  --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
index 5fa802b7f27cad..83ae03452db5b3 100644
--- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll
@@ -8,52 +8,52 @@ define iXLen2 @test_udiv_3(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_3:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 699051
-; RV32-NEXT:    addi a4, a3, -1365
-; RV32-NEXT:    mulhu a5, a2, a4
-; RV32-NEXT:    srli a6, a5, 1
-; RV32-NEXT:    andi a5, a5, -2
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    sub a2, a2, a5
-; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    sltu a4, a2, a0
+; RV32-NEXT:    addi a5, a3, -1365
 ; RV32-NEXT:    addi a3, a3, -1366
-; RV32-NEXT:    mul a3, a5, a3
-; RV32-NEXT:    mulhu a6, a5, a4
-; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    mulhu a4, a2, a5
+; RV32-NEXT:    srli a6, a4, 1
+; RV32-NEXT:    andi a4, a4, -2
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    sub a2, a2, a4
+; RV32-NEXT:    sub a4, a0, a2
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a4, a3
+; RV32-NEXT:    mulhu a3, a4, a5
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    mul a0, a5, a4
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    mul a0, a4, a5
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_3:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    sltu a3, a2, a0
-; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    lui a3, 699051
+; RV64-NEXT:    lui a4, %hi(.LCPI0_0)
+; RV64-NEXT:    sltu a5, a2, a0
 ; RV64-NEXT:    addiw a3, a3, -1365
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    mulhu a4, a2, a3
-; RV64-NEXT:    srli a5, a4, 1
-; RV64-NEXT:    andi a4, a4, -2
-; RV64-NEXT:    lui a6, %hi(.LCPI0_0)
-; RV64-NEXT:    ld a6, %lo(.LCPI0_0)(a6)
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    sub a2, a2, a4
-; RV64-NEXT:    sub a4, a0, a2
-; RV64-NEXT:    mul a5, a4, a6
-; RV64-NEXT:    mulhu a6, a4, a3
-; RV64-NEXT:    add a5, a6, a5
+; RV64-NEXT:    ld a4, %lo(.LCPI0_0)(a4)
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    mulhu a5, a2, a3
+; RV64-NEXT:    srli a6, a5, 1
+; RV64-NEXT:    andi a5, a5, -2
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    sub a2, a2, a5
+; RV64-NEXT:    sub a5, a0, a2
 ; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a5, a4
+; RV64-NEXT:    mulhu a4, a5, a3
 ; RV64-NEXT:    sub a1, a1, a0
+; RV64-NEXT:    add a2, a4, a2
 ; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, a5, a1
-; RV64-NEXT:    mul a0, a4, a3
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    mul a0, a5, a3
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 3
   ret iXLen2 %a
@@ -63,52 +63,52 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_5:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 838861
-; RV32-NEXT:    addi a4, a3, -819
-; RV32-NEXT:    mulhu a5, a2, a4
-; RV32-NEXT:    srli a6, a5, 2
-; RV32-NEXT:    andi a5, a5, -4
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    sub a2, a2, a5
-; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    sltu a4, a2, a0
+; RV32-NEXT:    addi a5, a3, -819
 ; RV32-NEXT:    addi a3, a3, -820
-; RV32-NEXT:    mul a3, a5, a3
-; RV32-NEXT:    mulhu a6, a5, a4
-; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    mulhu a4, a2, a5
+; RV32-NEXT:    srli a6, a4, 2
+; RV32-NEXT:    andi a4, a4, -4
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    sub a2, a2, a4
+; RV32-NEXT:    sub a4, a0, a2
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a4, a3
+; RV32-NEXT:    mulhu a3, a4, a5
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    mul a0, a5, a4
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    mul a0, a4, a5
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_5:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    sltu a3, a2, a0
-; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    lui a3, 838861
+; RV64-NEXT:    lui a4, %hi(.LCPI1_0)
+; RV64-NEXT:    sltu a5, a2, a0
 ; RV64-NEXT:    addiw a3, a3, -819
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    mulhu a4, a2, a3
-; RV64-NEXT:    srli a5, a4, 2
-; RV64-NEXT:    andi a4, a4, -4
-; RV64-NEXT:    lui a6, %hi(.LCPI1_0)
-; RV64-NEXT:    ld a6, %lo(.LCPI1_0)(a6)
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    sub a2, a2, a4
-; RV64-NEXT:    sub a4, a0, a2
-; RV64-NEXT:    mul a5, a4, a6
-; RV64-NEXT:    mulhu a6, a4, a3
-; RV64-NEXT:    add a5, a6, a5
+; RV64-NEXT:    ld a4, %lo(.LCPI1_0)(a4)
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    mulhu a5, a2, a3
+; RV64-NEXT:    srli a6, a5, 2
+; RV64-NEXT:    andi a5, a5, -4
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    sub a2, a2, a5
+; RV64-NEXT:    sub a5, a0, a2
 ; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a5, a4
+; RV64-NEXT:    mulhu a4, a5, a3
 ; RV64-NEXT:    sub a1, a1, a0
+; RV64-NEXT:    add a2, a4, a2
 ; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, a5, a1
-; RV64-NEXT:    mul a0, a4, a3
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    mul a0, a5, a3
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 5
   ret iXLen2 %a
@@ -170,57 +170,57 @@ define iXLen2 @test_udiv_15(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_15:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 559241
+; RV32-NEXT:    lui a4, 978671
+; RV32-NEXT:    sltu a5, a2, a0
 ; RV32-NEXT:    addi a3, a3, -1911
+; RV32-NEXT:    addi a6, a4, -274
+; RV32-NEXT:    addi a4, a4, -273
+; RV32-NEXT:    add a2, a2, a5
 ; RV32-NEXT:    mulhu a3, a2, a3
 ; RV32-NEXT:    srli a3, a3, 3
-; RV32-NEXT:    slli a4, a3, 4
-; RV32-NEXT:    sub a3, a3, a4
+; RV32-NEXT:    slli a5, a3, 4
+; RV32-NEXT:    sub a3, a3, a5
 ; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    sub a3, a0, a2
-; RV32-NEXT:    lui a4, 978671
-; RV32-NEXT:    addi a5, a4, -274
-; RV32-NEXT:    mul a5, a3, a5
-; RV32-NEXT:    addi a4, a4, -273
-; RV32-NEXT:    mulhu a6, a3, a4
-; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a3, a6
+; RV32-NEXT:    mulhu a5, a3, a4
 ; RV32-NEXT:    sub a1, a1, a0
+; RV32-NEXT:    add a2, a5, a2
 ; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, a5, a1
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    mul a0, a3, a4
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_15:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    sltu a3, a2, a0
-; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    lui a3, 559241
+; RV64-NEXT:    lui a4, %hi(.LCPI4_0)
+; RV64-NEXT:    lui a5, 978671
+; RV64-NEXT:    sltu a6, a2, a0
 ; RV64-NEXT:    addiw a3, a3, -1911
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    ld a4, %lo(.LCPI4_0)(a4)
+; RV64-NEXT:    addiw a5, a5, -273
+; RV64-NEXT:    add a2, a2, a6
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a3, a3, a6
+; RV64-NEXT:    slli a6, a5, 32
+; RV64-NEXT:    add a5, a5, a6
 ; RV64-NEXT:    mulhu a3, a2, a3
 ; RV64-NEXT:    srli a3, a3, 3
-; RV64-NEXT:    slli a4, a3, 4
-; RV64-NEXT:    lui a5, %hi(.LCPI4_0)
-; RV64-NEXT:    ld a5, %lo(.LCPI4_0)(a5)
-; RV64-NEXT:    sub a3, a3, a4
+; RV64-NEXT:    slli a6, a3, 4
+; RV64-NEXT:    sub a3, a3, a6
 ; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    sub a3, a0, a2
-; RV64-NEXT:    mul a4, a3, a5
-; RV64-NEXT:    lui a5, 978671
-; RV64-NEXT:    addiw a5, a5, -273
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    mulhu a6, a3, a5
-; RV64-NEXT:    add a4, a6, a4
 ; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a3, a4
+; RV64-NEXT:    mulhu a4, a3, a5
 ; RV64-NEXT:    sub a1, a1, a0
+; RV64-NEXT:    add a2, a4, a2
 ; RV64-NEXT:    mul a1, a1, a5
-; RV64-NEXT:    add a1, a4, a1
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mul a0, a3, a5
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 15
@@ -231,52 +231,52 @@ define iXLen2 @test_udiv_17(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_17:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 986895
-; RV32-NEXT:    addi a4, a3, 241
-; RV32-NEXT:    mulhu a5, a2, a4
-; RV32-NEXT:    srli a6, a5, 4
-; RV32-NEXT:    andi a5, a5, -16
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    sub a2, a2, a5
-; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    sltu a4, a2, a0
+; RV32-NEXT:    addi a5, a3, 241
 ; RV32-NEXT:    addi a3, a3, 240
-; RV32-NEXT:    mul a3, a5, a3
-; RV32-NEXT:    mulhu a6, a5, a4
-; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    mulhu a4, a2, a5
+; RV32-NEXT:    srli a6, a4, 4
+; RV32-NEXT:    andi a4, a4, -16
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    sub a2, a2, a4
+; RV32-NEXT:    sub a4, a0, a2
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a4, a3
+; RV32-NEXT:    mulhu a3, a4, a5
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    mul a0, a5, a4
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    mul a0, a4, a5
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_17:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    sltu a3, a2, a0
-; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    lui a3, 986895
+; RV64-NEXT:    lui a4, %hi(.LCPI5_0)
+; RV64-NEXT:    sltu a5, a2, a0
 ; RV64-NEXT:    addiw a3, a3, 241
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    mulhu a4, a2, a3
-; RV64-NEXT:    srli a5, a4, 4
-; RV64-NEXT:    andi a4, a4, -16
-; RV64-NEXT:    lui a6, %hi(.LCPI5_0)
-; RV64-NEXT:    ld a6, %lo(.LCPI5_0)(a6)
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    sub a2, a2, a4
-; RV64-NEXT:    sub a4, a0, a2
-; RV64-NEXT:    mul a5, a4, a6
-; RV64-NEXT:    mulhu a6, a4, a3
-; RV64-NEXT:    add a5, a6, a5
+; RV64-NEXT:    ld a4, %lo(.LCPI5_0)(a4)
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    mulhu a5, a2, a3
+; RV64-NEXT:    srli a6, a5, 4
+; RV64-NEXT:    andi a5, a5, -16
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    sub a2, a2, a5
+; RV64-NEXT:    sub a5, a0, a2
 ; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a5, a4
+; RV64-NEXT:    mulhu a4, a5, a3
 ; RV64-NEXT:    sub a1, a1, a0
+; RV64-NEXT:    add a2, a4, a2
 ; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, a5, a1
-; RV64-NEXT:    mul a0, a4, a3
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    mul a0, a5, a3
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 17
   ret iXLen2 %a
@@ -286,57 +286,57 @@ define iXLen2 @test_udiv_255(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_255:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 526344
+; RV32-NEXT:    lui a4, 1044464
+; RV32-NEXT:    sltu a5, a2, a0
 ; RV32-NEXT:    addi a3, a3, 129
+; RV32-NEXT:    addi a6, a4, -258
+; RV32-NEXT:    addi a4, a4, -257
+; RV32-NEXT:    add a2, a2, a5
 ; RV32-NEXT:    mulhu a3, a2, a3
 ; RV32-NEXT:    srli a3, a3, 7
-; RV32-NEXT:    slli a4, a3, 8
-; RV32-NEXT:    sub a3, a3, a4
+; RV32-NEXT:    slli a5, a3, 8
+; RV32-NEXT:    sub a3, a3, a5
 ; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    sub a3, a0, a2
-; RV32-NEXT:    lui a4, 1044464
-; RV32-NEXT:    addi a5, a4, -258
-; RV32-NEXT:    mul a5, a3, a5
-; RV32-NEXT:    addi a4, a4, -257
-; RV32-NEXT:    mulhu a6, a3, a4
-; RV32-NEXT:    add a5, a6, a5
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a3, a6
+; RV32-NEXT:    mulhu a5, a3, a4
 ; RV32-NEXT:    sub a1, a1, a0
+; RV32-NEXT:    add a2, a5, a2
 ; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, a5, a1
+; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    mul a0, a3, a4
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_255:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    sltu a3, a2, a0
-; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    lui a3, 526344
+; RV64-NEXT:    lui a4, %hi(.LCPI6_0)
+; RV64-NEXT:    lui a5, 1044464
+; RV64-NEXT:    sltu a6, a2, a0
 ; RV64-NEXT:    addiw a3, a3, 129
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
+; RV64-NEXT:    ld a4, %lo(.LCPI6_0)(a4)
+; RV64-NEXT:    addiw a5, a5, -257
+; RV64-NEXT:    add a2, a2, a6
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a3, a3, a6
+; RV64-NEXT:    slli a6, a5, 32
+; RV64-NEXT:    add a5, a5, a6
 ; RV64-NEXT:    mulhu a3, a2, a3
 ; RV64-NEXT:    srli a3, a3, 7
-; RV64-NEXT:    slli a4, a3, 8
-; RV64-NEXT:    lui a5, %hi(.LCPI6_0)
-; RV64-NEXT:    ld a5, %lo(.LCPI6_0)(a5)
-; RV64-NEXT:    sub a3, a3, a4
+; RV64-NEXT:    slli a6, a3, 8
+; RV64-NEXT:    sub a3, a3, a6
 ; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    sub a3, a0, a2
-; RV64-NEXT:    mul a4, a3, a5
-; RV64-NEXT:    lui a5, 1044464
-; RV64-NEXT:    addiw a5, a5, -257
-; RV64-NEXT:    slli a6, a5, 32
-; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    mulhu a6, a3, a5
-; RV64-NEXT:    add a4, a6, a4
 ; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a3, a4
+; RV64-NEXT:    mulhu a4, a3, a5
 ; RV64-NEXT:    sub a1, a1, a0
+; RV64-NEXT:    add a2, a4, a2
 ; RV64-NEXT:    mul a1, a1, a5
-; RV64-NEXT:    add a1, a4, a1
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mul a0, a3, a5
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 255
@@ -347,52 +347,52 @@ define iXLen2 @test_udiv_257(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_257:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 1044496
-; RV32-NEXT:    addi a4, a3, -255
-; RV32-NEXT:    mulhu a5, a2, a4
-; RV32-NEXT:    srli a6, a5, 8
-; RV32-NEXT:    andi a5, a5, -256
-; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    sub a2, a2, a5
-; RV32-NEXT:    sub a5, a0, a2
+; RV32-NEXT:    sltu a4, a2, a0
+; RV32-NEXT:    addi a5, a3, -255
 ; RV32-NEXT:    addi a3, a3, -256
-; RV32-NEXT:    mul a3, a5, a3
-; RV32-NEXT:    mulhu a6, a5, a4
-; RV32-NEXT:    add a3, a6, a3
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    mulhu a4, a2, a5
+; RV32-NEXT:    srli a6, a4, 8
+; RV32-NEXT:    andi a4, a4, -256
+; RV32-NEXT:    add a4, a4, a6
+; RV32-NEXT:    sub a2, a2, a4
+; RV32-NEXT:    sub a4, a0, a2
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a4, a3
+; RV32-NEXT:    mulhu a3, a4, a5
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a1, a1, a4
-; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    mul a0, a5, a4
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    mul a1, a1, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    mul a0, a4, a5
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_257:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    sltu a3, a2, a0
-; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    lui a3, 1044496
+; RV64-NEXT:    lui a4, %hi(.LCPI7_0)
+; RV64-NEXT:    sltu a5, a2, a0
 ; RV64-NEXT:    addiw a3, a3, -255
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    mulhu a4, a2, a3
-; RV64-NEXT:    srli a5, a4, 8
-; RV64-NEXT:    andi a4, a4, -256
-; RV64-NEXT:    lui a6, %hi(.LCPI7_0)
-; RV64-NEXT:    ld a6, %lo(.LCPI7_0)(a6)
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    sub a2, a2, a4
-; RV64-NEXT:    sub a4, a0, a2
-; RV64-NEXT:    mul a5, a4, a6
-; RV64-NEXT:    mulhu a6, a4, a3
-; RV64-NEXT:    add a5, a6, a5
+; RV64-NEXT:    ld a4, %lo(.LCPI7_0)(a4)
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    mulhu a5, a2, a3
+; RV64-NEXT:    srli a6, a5, 8
+; RV64-NEXT:    andi a5, a5, -256
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    sub a2, a2, a5
+; RV64-NEXT:    sub a5, a0, a2
 ; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a5, a4
+; RV64-NEXT:    mulhu a4, a5, a3
 ; RV64-NEXT:    sub a1, a1, a0
+; RV64-NEXT:    add a2, a4, a2
 ; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, a5, a1
-; RV64-NEXT:    mul a0, a4, a3
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    mul a0, a5, a3
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 257
   ret iXLen2 %a
@@ -402,63 +402,63 @@ define iXLen2 @test_udiv_65535(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_65535:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 524296
+; RV32-NEXT:    lui a4, 1048560
+; RV32-NEXT:    sltu a5, a2, a0
 ; RV32-NEXT:    addi a3, a3, 1
+; RV32-NEXT:    add a2, a2, a5
 ; RV32-NEXT:    mulhu a3, a2, a3
 ; RV32-NEXT:    srli a3, a3, 15
-; RV32-NEXT:    slli a4, a3, 16
-; RV32-NEXT:    sub a3, a3, a4
-; RV32-NEXT:    add a2, a2, a3
-; RV32-NEXT:    sub a3, a0, a2
-; RV32-NEXT:    lui a4, 1048560
+; RV32-NEXT:    slli a5, a3, 16
+; RV32-NEXT:    sub a3, a3, a5
 ; RV32-NEXT:    addi a5, a4, -2
-; RV32-NEXT:    mul a5, a3, a5
 ; RV32-NEXT:    addi a4, a4, -1
-; RV32-NEXT:    mulhu a4, a3, a4
-; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sub a3, a0, a2
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mul a2, a3, a5
+; RV32-NEXT:    mulhu a4, a3, a4
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    slli a0, a1, 16
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    sub a1, a4, a0
 ; RV32-NEXT:    slli a0, a3, 16
-; RV32-NEXT:    neg a2, a3
-; RV32-NEXT:    sub a0, a2, a0
+; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    add a2, a4, a2
+; RV32-NEXT:    slli a4, a1, 16
+; RV32-NEXT:    add a1, a4, a1
+; RV32-NEXT:    sub a1, a2, a1
+; RV32-NEXT:    sub a0, a3, a0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_65535:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    sltu a3, a2, a0
-; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    lui a3, 524296
-; RV64-NEXT:    addiw a3, a3, 1
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    mulhu a3, a2, a3
-; RV64-NEXT:    srli a3, a3, 15
-; RV64-NEXT:    slli a4, a3, 16
-; RV64-NEXT:    sub a3, a3, a4
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    sub a3, a0, a2
 ; RV64-NEXT:    lui a4, 983039
-; RV64-NEXT:    slli a4, a4, 4
-; RV64-NEXT:    addi a4, a4, -1
-; RV64-NEXT:    slli a4, a4, 16
-; RV64-NEXT:    addi a4, a4, -2
-; RV64-NEXT:    mul a4, a3, a4
 ; RV64-NEXT:    lui a5, 1048560
+; RV64-NEXT:    sltu a6, a2, a0
+; RV64-NEXT:    addiw a3, a3, 1
+; RV64-NEXT:    slli a4, a4, 4
 ; RV64-NEXT:    addiw a5, a5, -1
+; RV64-NEXT:    add a2, a2, a6
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    addi a4, a4, -1
+; RV64-NEXT:    add a3, a3, a6
 ; RV64-NEXT:    slli a6, a5, 32
+; RV64-NEXT:    slli a4, a4, 16
 ; RV64-NEXT:    add a5, a5, a6
-; RV64-NEXT:    mulhu a6, a3, a5
-; RV64-NEXT:    add a4, a6, a4
+; RV64-NEXT:    mulhu a3, a2, a3
+; RV64-NEXT:    addi a4, a4, -2
+; RV64-NEXT:    srli a3, a3, 15
+; RV64-NEXT:    slli a6, a3, 16
+; RV64-NEXT:    sub a3, a3, a6
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    sub a3, a0, a2
 ; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a3, a4
+; RV64-NEXT:    mulhu a4, a3, a5
 ; RV64-NEXT:    sub a1, a1, a0
+; RV64-NEXT:    add a2, a4, a2
 ; RV64-NEXT:    mul a1, a1, a5
-; RV64-NEXT:    add a1, a4, a1
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mul a0, a3, a5
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 65535
@@ -469,54 +469,54 @@ define iXLen2 @test_udiv_65537(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_udiv_65537:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 1048560
-; RV32-NEXT:    addi a4, a3, 1
-; RV32-NEXT:    mulhu a5, a2, a4
-; RV32-NEXT:    and a3, a5, a3
-; RV32-NEXT:    srli a5, a5, 16
-; RV32-NEXT:    or a3, a3, a5
+; RV32-NEXT:    sltu a4, a2, a0
+; RV32-NEXT:    addi a5, a3, 1
+; RV32-NEXT:    add a2, a2, a4
+; RV32-NEXT:    mulhu a4, a2, a5
+; RV32-NEXT:    and a3, a4, a3
+; RV32-NEXT:    srli a4, a4, 16
+; RV32-NEXT:    or a3, a3, a4
 ; RV32-NEXT:    sub a2, a2, a3
 ; RV32-NEXT:    sub a3, a0, a2
-; RV32-NEXT:    mulhu a4, a3, a4
-; RV32-NEXT:    slli a5, a3, 16
-; RV32-NEXT:    sub a4, a4, a5
 ; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mulhu a2, a3, a5
+; RV32-NEXT:    slli a4, a3, 16
 ; RV32-NEXT:    sub a1, a1, a0
+; RV32-NEXT:    sub a2, a2, a4
 ; RV32-NEXT:    slli a0, a1, 16
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    add a1, a4, a1
-; RV32-NEXT:    sub a0, a3, a5
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    sub a0, a3, a4
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_65537:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    sltu a3, a2, a0
-; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    lui a3, 1048560
-; RV64-NEXT:    addiw a4, a3, 1
-; RV64-NEXT:    slli a5, a4, 32
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    mulhu a5, a2, a4
-; RV64-NEXT:    and a3, a5, a3
-; RV64-NEXT:    srli a5, a5, 16
-; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    lui a4, 983041
+; RV64-NEXT:    sltu a5, a2, a0
+; RV64-NEXT:    addiw a6, a3, 1
+; RV64-NEXT:    slli a4, a4, 4
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a6, 32
+; RV64-NEXT:    addi a4, a4, -1
+; RV64-NEXT:    add a5, a6, a5
+; RV64-NEXT:    slli a4, a4, 16
+; RV64-NEXT:    mulhu a6, a2, a5
+; RV64-NEXT:    and a3, a6, a3
+; RV64-NEXT:    srli a6, a6, 16
+; RV64-NEXT:    add a3, a3, a6
 ; RV64-NEXT:    sub a2, a2, a3
 ; RV64-NEXT:    sub a3, a0, a2
-; RV64-NEXT:    lui a5, 983041
-; RV64-NEXT:    slli a5, a5, 4
-; RV64-NEXT:    addi a5, a5, -1
-; RV64-NEXT:    slli a5, a5, 16
-; RV64-NEXT:    mul a5, a3, a5
-; RV64-NEXT:    mulhu a6, a3, a4
-; RV64-NEXT:    add a5, a6, a5
 ; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    mul a2, a3, a4
+; RV64-NEXT:    mulhu a4, a3, a5
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a1, a1, a4
-; RV64-NEXT:    add a1, a5, a1
-; RV64-NEXT:    mul a0, a3, a4
+; RV64-NEXT:    add a2, a4, a2
+; RV64-NEXT:    mul a1, a1, a5
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    mul a0, a3, a5
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 65537
   ret iXLen2 %a
@@ -527,59 +527,59 @@ define iXLen2 @test_udiv_12(iXLen2 %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a2, a1, 30
 ; RV32-NEXT:    srli a0, a0, 2
-; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    srli a1, a1, 2
-; RV32-NEXT:    add a2, a0, a1
-; RV32-NEXT:    sltu a3, a2, a0
-; RV32-NEXT:    add a2, a2, a3
 ; RV32-NEXT:    lui a3, 699051
-; RV32-NEXT:    addi a4, a3, -1365
-; RV32-NEXT:    mulhu a5, a2, a4
+; RV32-NEXT:    or a0, a0, a2
+; RV32-NEXT:    addi a2, a3, -1365
+; RV32-NEXT:    addi a3, a3, -1366
+; RV32-NEXT:    add a4, a0, a1
+; RV32-NEXT:    sltu a5, a4, a0
+; RV32-NEXT:    add a4, a4, a5
+; RV32-NEXT:    mulhu a5, a4, a2
 ; RV32-NEXT:    srli a6, a5, 1
 ; RV32-NEXT:    andi a5, a5, -2
 ; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    sub a2, a2, a5
-; RV32-NEXT:    sub a5, a0, a2
-; RV32-NEXT:    addi a3, a3, -1366
+; RV32-NEXT:    sub a4, a4, a5
+; RV32-NEXT:    sub a5, a0, a4
+; RV32-NEXT:    sltu a0, a0, a4
 ; RV32-NEXT:    mul a3, a5, a3
-; RV32-NEXT:    mulhu a6, a5, a4
-; RV32-NEXT:    add a3, a6, a3
-; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    mulhu a4, a5, a2
 ; RV32-NEXT:    sub a1, a1, a0
-; RV32-NEXT:    mul a1, a1, a4
+; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    mul a0, a5, a4
+; RV32-NEXT:    mul a0, a5, a2
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: test_udiv_12:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a2, a1, 62
 ; RV64-NEXT:    srli a0, a0, 2
-; RV64-NEXT:    or a0, a0, a2
 ; RV64-NEXT:    srli a1, a1, 2
-; RV64-NEXT:    add a2, a0, a1
-; RV64-NEXT:    sltu a3, a2, a0
-; RV64-NEXT:    add a2, a2, a3
 ; RV64-NEXT:    lui a3, 699051
-; RV64-NEXT:    addiw a3, a3, -1365
-; RV64-NEXT:    slli a4, a3, 32
-; RV64-NEXT:    add a3, a3, a4
-; RV64-NEXT:    mulhu a4, a2, a3
-; RV64-NEXT:    srli a5, a4, 1
-; RV64-NEXT:    andi a4, a4, -2
-; RV64-NEXT:    lui a6, %hi(.LCPI10_0)
-; RV64-NEXT:    ld a6, %lo(.LCPI10_0)(a6)
-; RV64-NEXT:    add a4, a4, a5
-; RV64-NEXT:    sub a2, a2, a4
-; RV64-NEXT:    sub a4, a0, a2
-; RV64-NEXT:    mul a5, a4, a6
-; RV64-NEXT:    mulhu a6, a4, a3
-; RV64-NEXT:    add a5, a6, a5
-; RV64-NEXT:    sltu a0, a0, a2
+; RV64-NEXT:    lui a4, %hi(.LCPI10_0)
+; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    addiw a2, a3, -1365
+; RV64-NEXT:    ld a3, %lo(.LCPI10_0)(a4)
+; RV64-NEXT:    add a4, a0, a1
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    sltu a6, a4, a0
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    mulhu a5, a4, a2
+; RV64-NEXT:    srli a6, a5, 1
+; RV64-NEXT:    andi a5, a5, -2
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    sub a4, a4, a5
+; RV64-NEXT:    sub a5, a0, a4
+; RV64-NEXT:    sltu a0, a0, a4
+; RV64-NEXT:    mul a3, a5, a3
+; RV64-NEXT:    mulhu a4, a5, a2
 ; RV64-NEXT:    sub a1, a1, a0
-; RV64-NEXT:    mul a1, a1, a3
-; RV64-NEXT:    add a1, a5, a1
-; RV64-NEXT:    mul a0, a4, a3
+; RV64-NEXT:    add a3, a4, a3
+; RV64-NEXT:    mul a1, a1, a2
+; RV64-NEXT:    add a1, a3, a1
+; RV64-NEXT:    mul a0, a5, a2
 ; RV64-NEXT:    ret
   %a = udiv iXLen2 %x, 12
   ret iXLen2 %a

diff  --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
index 8444520fcc7718..ae8117c3ce0bd8 100644
--- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
+++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll
@@ -8,11 +8,11 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_3:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    lui a2, 699051
 ; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    addi a2, a2, -1365
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    lui a1, 699051
-; RV32-NEXT:    addi a1, a1, -1365
-; RV32-NEXT:    mulhu a1, a0, a1
+; RV32-NEXT:    mulhu a1, a0, a2
 ; RV32-NEXT:    srli a2, a1, 1
 ; RV32-NEXT:    andi a1, a1, -2
 ; RV32-NEXT:    add a1, a1, a2
@@ -23,12 +23,12 @@ define iXLen2 @test_urem_3(iXLen2 %x) nounwind {
 ; RV64-LABEL: test_urem_3:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    lui a2, 699051
 ; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    addiw a2, a2, -1365
 ; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    lui a1, 699051
-; RV64-NEXT:    addiw a1, a1, -1365
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    slli a1, a2, 32
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mulhu a1, a0, a1
 ; RV64-NEXT:    srli a2, a1, 1
 ; RV64-NEXT:    andi a1, a1, -2
@@ -44,11 +44,11 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_5:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    lui a2, 838861
 ; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    addi a2, a2, -819
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    lui a1, 838861
-; RV32-NEXT:    addi a1, a1, -819
-; RV32-NEXT:    mulhu a1, a0, a1
+; RV32-NEXT:    mulhu a1, a0, a2
 ; RV32-NEXT:    srli a2, a1, 2
 ; RV32-NEXT:    andi a1, a1, -4
 ; RV32-NEXT:    add a1, a1, a2
@@ -59,12 +59,12 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
 ; RV64-LABEL: test_urem_5:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    lui a2, 838861
 ; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    addiw a2, a2, -819
 ; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    lui a1, 838861
-; RV64-NEXT:    addiw a1, a1, -819
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    slli a1, a2, 32
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mulhu a1, a0, a1
 ; RV64-NEXT:    srli a2, a1, 2
 ; RV64-NEXT:    andi a1, a1, -4
@@ -132,10 +132,10 @@ define iXLen2 @test_urem_15(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_15:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    lui a2, 559241
 ; RV32-NEXT:    sltu a0, a1, a0
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    lui a1, 559241
-; RV32-NEXT:    addi a1, a1, -1911
+; RV32-NEXT:    addi a1, a2, -1911
 ; RV32-NEXT:    mulhu a1, a0, a1
 ; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    slli a2, a1, 4
@@ -147,12 +147,12 @@ define iXLen2 @test_urem_15(iXLen2 %x) nounwind {
 ; RV64-LABEL: test_urem_15:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    lui a2, 559241
 ; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    addiw a2, a2, -1911
 ; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    lui a1, 559241
-; RV64-NEXT:    addiw a1, a1, -1911
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    slli a1, a2, 32
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mulhu a1, a0, a1
 ; RV64-NEXT:    srli a1, a1, 3
 ; RV64-NEXT:    slli a2, a1, 4
@@ -168,11 +168,11 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_17:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    lui a2, 986895
 ; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    addi a2, a2, 241
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    lui a1, 986895
-; RV32-NEXT:    addi a1, a1, 241
-; RV32-NEXT:    mulhu a1, a0, a1
+; RV32-NEXT:    mulhu a1, a0, a2
 ; RV32-NEXT:    srli a2, a1, 4
 ; RV32-NEXT:    andi a1, a1, -16
 ; RV32-NEXT:    add a1, a1, a2
@@ -183,12 +183,12 @@ define iXLen2 @test_urem_17(iXLen2 %x) nounwind {
 ; RV64-LABEL: test_urem_17:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    lui a2, 986895
 ; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    addiw a2, a2, 241
 ; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    lui a1, 986895
-; RV64-NEXT:    addiw a1, a1, 241
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    slli a1, a2, 32
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mulhu a1, a0, a1
 ; RV64-NEXT:    srli a2, a1, 4
 ; RV64-NEXT:    andi a1, a1, -16
@@ -204,10 +204,10 @@ define iXLen2 @test_urem_255(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_255:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    lui a2, 526344
 ; RV32-NEXT:    sltu a0, a1, a0
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    lui a1, 526344
-; RV32-NEXT:    addi a1, a1, 129
+; RV32-NEXT:    addi a1, a2, 129
 ; RV32-NEXT:    mulhu a1, a0, a1
 ; RV32-NEXT:    srli a1, a1, 7
 ; RV32-NEXT:    slli a2, a1, 8
@@ -219,12 +219,12 @@ define iXLen2 @test_urem_255(iXLen2 %x) nounwind {
 ; RV64-LABEL: test_urem_255:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    lui a2, 526344
 ; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    addiw a2, a2, 129
 ; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    lui a1, 526344
-; RV64-NEXT:    addiw a1, a1, 129
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    slli a1, a2, 32
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mulhu a1, a0, a1
 ; RV64-NEXT:    srli a1, a1, 7
 ; RV64-NEXT:    slli a2, a1, 8
@@ -240,11 +240,11 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_257:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    lui a2, 1044496
 ; RV32-NEXT:    sltu a0, a1, a0
+; RV32-NEXT:    addi a2, a2, -255
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    lui a1, 1044496
-; RV32-NEXT:    addi a1, a1, -255
-; RV32-NEXT:    mulhu a1, a0, a1
+; RV32-NEXT:    mulhu a1, a0, a2
 ; RV32-NEXT:    srli a2, a1, 8
 ; RV32-NEXT:    andi a1, a1, -256
 ; RV32-NEXT:    add a1, a1, a2
@@ -255,12 +255,12 @@ define iXLen2 @test_urem_257(iXLen2 %x) nounwind {
 ; RV64-LABEL: test_urem_257:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    lui a2, 1044496
 ; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    addiw a2, a2, -255
 ; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    lui a1, 1044496
-; RV64-NEXT:    addiw a1, a1, -255
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    slli a1, a2, 32
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mulhu a1, a0, a1
 ; RV64-NEXT:    srli a2, a1, 8
 ; RV64-NEXT:    andi a1, a1, -256
@@ -276,11 +276,11 @@ define iXLen2 @test_urem_65535(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_65535:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    lui a2, 524296
 ; RV32-NEXT:    sltu a0, a1, a0
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    lui a1, 524296
-; RV32-NEXT:    addi a1, a1, 1
-; RV32-NEXT:    mulhu a1, a0, a1
+; RV32-NEXT:    addi a2, a2, 1
+; RV32-NEXT:    mulhu a1, a0, a2
 ; RV32-NEXT:    srli a1, a1, 15
 ; RV32-NEXT:    slli a2, a1, 16
 ; RV32-NEXT:    sub a1, a1, a2
@@ -291,12 +291,12 @@ define iXLen2 @test_urem_65535(iXLen2 %x) nounwind {
 ; RV64-LABEL: test_urem_65535:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    lui a2, 524296
 ; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    addiw a2, a2, 1
 ; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    lui a1, 524296
-; RV64-NEXT:    addiw a1, a1, 1
-; RV64-NEXT:    slli a2, a1, 32
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    slli a1, a2, 32
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    mulhu a1, a0, a1
 ; RV64-NEXT:    srli a1, a1, 15
 ; RV64-NEXT:    slli a2, a1, 16
@@ -312,14 +312,14 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
 ; RV32-LABEL: test_urem_65537:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    add a1, a0, a1
+; RV32-NEXT:    lui a2, 1048560
 ; RV32-NEXT:    sltu a0, a1, a0
 ; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    lui a1, 1048560
-; RV32-NEXT:    addi a2, a1, 1
-; RV32-NEXT:    mulhu a2, a0, a2
-; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    srli a2, a2, 16
-; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    addi a1, a2, 1
+; RV32-NEXT:    mulhu a1, a0, a1
+; RV32-NEXT:    and a2, a1, a2
+; RV32-NEXT:    srli a1, a1, 16
+; RV32-NEXT:    or a1, a2, a1
 ; RV32-NEXT:    sub a0, a0, a1
 ; RV32-NEXT:    li a1, 0
 ; RV32-NEXT:    ret
@@ -327,16 +327,16 @@ define iXLen2 @test_urem_65537(iXLen2 %x) nounwind {
 ; RV64-LABEL: test_urem_65537:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    add a1, a0, a1
+; RV64-NEXT:    lui a2, 1048560
 ; RV64-NEXT:    sltu a0, a1, a0
+; RV64-NEXT:    addiw a3, a2, 1
 ; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    lui a1, 1048560
-; RV64-NEXT:    addiw a2, a1, 1
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    mulhu a2, a0, a2
-; RV64-NEXT:    and a1, a2, a1
-; RV64-NEXT:    srli a2, a2, 16
-; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    slli a1, a3, 32
+; RV64-NEXT:    add a1, a3, a1
+; RV64-NEXT:    mulhu a1, a0, a1
+; RV64-NEXT:    and a2, a1, a2
+; RV64-NEXT:    srli a1, a1, 16
+; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    sub a0, a0, a1
 ; RV64-NEXT:    li a1, 0
 ; RV64-NEXT:    ret
@@ -349,14 +349,14 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a2, a1, 30
 ; RV32-NEXT:    srli a3, a0, 2
-; RV32-NEXT:    or a2, a3, a2
 ; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    or a2, a3, a2
+; RV32-NEXT:    lui a3, 699051
+; RV32-NEXT:    addi a3, a3, -1365
 ; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    sltu a2, a1, a2
 ; RV32-NEXT:    add a1, a1, a2
-; RV32-NEXT:    lui a2, 699051
-; RV32-NEXT:    addi a2, a2, -1365
-; RV32-NEXT:    mulhu a2, a1, a2
+; RV32-NEXT:    mulhu a2, a1, a3
 ; RV32-NEXT:    srli a3, a2, 1
 ; RV32-NEXT:    andi a2, a2, -2
 ; RV32-NEXT:    add a2, a2, a3
@@ -371,16 +371,16 @@ define iXLen2 @test_urem_12(iXLen2 %x) nounwind {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a2, a1, 62
 ; RV64-NEXT:    srli a3, a0, 2
+; RV64-NEXT:    lui a4, 699051
 ; RV64-NEXT:    or a2, a3, a2
+; RV64-NEXT:    addiw a3, a4, -1365
+; RV64-NEXT:    slli a4, a3, 32
+; RV64-NEXT:    add a3, a3, a4
 ; RV64-NEXT:    srli a1, a1, 2
 ; RV64-NEXT:    add a1, a2, a1
 ; RV64-NEXT:    sltu a2, a1, a2
 ; RV64-NEXT:    add a1, a1, a2
-; RV64-NEXT:    lui a2, 699051
-; RV64-NEXT:    addiw a2, a2, -1365
-; RV64-NEXT:    slli a3, a2, 32
-; RV64-NEXT:    add a2, a2, a3
-; RV64-NEXT:    mulhu a2, a1, a2
+; RV64-NEXT:    mulhu a2, a1, a3
 ; RV64-NEXT:    srli a3, a2, 1
 ; RV64-NEXT:    andi a2, a2, -2
 ; RV64-NEXT:    add a2, a2, a3

diff  --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
index 3ccad02fbb2bf3..4c9d9e5ffdf770 100644
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -17,9 +17,9 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; RV32-NEXT:    lui a1, 662
 ; RV32-NEXT:    addi a1, a1, -83
 ; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    lui a1, 1324
 ; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    srli a0, a0, 3
-; RV32-NEXT:    lui a1, 1324
 ; RV32-NEXT:    addi a1, a1, -165
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -36,9 +36,9 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; RV64-NEXT:    lui a1, 662
 ; RV64-NEXT:    addi a1, a1, -83
 ; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    lui a1, 1324
 ; RV64-NEXT:    slli a0, a0, 35
 ; RV64-NEXT:    srli a0, a0, 35
-; RV64-NEXT:    lui a1, 1324
 ; RV64-NEXT:    addiw a1, a1, -165
 ; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -53,9 +53,9 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; RV32M-NEXT:    lui a1, 662
 ; RV32M-NEXT:    addi a1, a1, -83
 ; RV32M-NEXT:    add a0, a0, a1
+; RV32M-NEXT:    lui a1, 1324
 ; RV32M-NEXT:    slli a0, a0, 3
 ; RV32M-NEXT:    srli a0, a0, 3
-; RV32M-NEXT:    lui a1, 1324
 ; RV32M-NEXT:    addi a1, a1, -165
 ; RV32M-NEXT:    sltu a0, a0, a1
 ; RV32M-NEXT:    ret
@@ -68,9 +68,9 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; RV64M-NEXT:    lui a1, 662
 ; RV64M-NEXT:    addi a1, a1, -83
 ; RV64M-NEXT:    add a0, a0, a1
+; RV64M-NEXT:    lui a1, 1324
 ; RV64M-NEXT:    slli a0, a0, 35
 ; RV64M-NEXT:    srli a0, a0, 35
-; RV64M-NEXT:    lui a1, 1324
 ; RV64M-NEXT:    addiw a1, a1, -165
 ; RV64M-NEXT:    sltu a0, a0, a1
 ; RV64M-NEXT:    ret
@@ -83,9 +83,9 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; RV32MV-NEXT:    lui a1, 662
 ; RV32MV-NEXT:    addi a1, a1, -83
 ; RV32MV-NEXT:    add a0, a0, a1
+; RV32MV-NEXT:    lui a1, 1324
 ; RV32MV-NEXT:    slli a0, a0, 3
 ; RV32MV-NEXT:    srli a0, a0, 3
-; RV32MV-NEXT:    lui a1, 1324
 ; RV32MV-NEXT:    addi a1, a1, -165
 ; RV32MV-NEXT:    sltu a0, a0, a1
 ; RV32MV-NEXT:    ret
@@ -98,9 +98,9 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; RV64MV-NEXT:    lui a1, 662
 ; RV64MV-NEXT:    addi a1, a1, -83
 ; RV64MV-NEXT:    add a0, a0, a1
+; RV64MV-NEXT:    lui a1, 1324
 ; RV64MV-NEXT:    slli a0, a0, 35
 ; RV64MV-NEXT:    srli a0, a0, 35
-; RV64MV-NEXT:    lui a1, 1324
 ; RV64MV-NEXT:    addiw a1, a1, -165
 ; RV64MV-NEXT:    sltu a0, a0, a1
 ; RV64MV-NEXT:    ret
@@ -310,18 +310,18 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    lw a0, 0(a0)
 ; RV32-NEXT:    slli a4, a1, 30
 ; RV32-NEXT:    srli s1, a2, 2
+; RV32-NEXT:    slli a5, a2, 31
 ; RV32-NEXT:    or s1, s1, a4
-; RV32-NEXT:    slli a4, a2, 31
-; RV32-NEXT:    srli a5, a3, 1
-; RV32-NEXT:    or s2, a5, a4
+; RV32-NEXT:    srli a4, a3, 1
+; RV32-NEXT:    or s2, a4, a5
 ; RV32-NEXT:    srli a1, a1, 2
-; RV32-NEXT:    slli a1, a1, 31
-; RV32-NEXT:    srai s3, a1, 31
 ; RV32-NEXT:    srli a2, a2, 1
+; RV32-NEXT:    slli a3, a3, 31
+; RV32-NEXT:    slli a1, a1, 31
 ; RV32-NEXT:    slli a2, a2, 31
+; RV32-NEXT:    srai s3, a1, 31
 ; RV32-NEXT:    srai s4, a2, 31
-; RV32-NEXT:    slli a1, a3, 31
-; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    srai a1, a3, 31
 ; RV32-NEXT:    li a2, 6
 ; RV32-NEXT:    li a3, 0
 ; RV32-NEXT:    call __moddi3
@@ -340,12 +340,12 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    mv a1, s3
 ; RV32-NEXT:    call __moddi3
 ; RV32-NEXT:    or a2, s5, s6
-; RV32-NEXT:    snez a2, a2
 ; RV32-NEXT:    xori a0, a0, 2
+; RV32-NEXT:    xori a3, s2, 1
+; RV32-NEXT:    snez a2, a2
 ; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    or a1, a3, s4
 ; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    xori a1, s2, 1
-; RV32-NEXT:    or a1, a1, s4
 ; RV32-NEXT:    seqz a1, a1
 ; RV32-NEXT:    neg a3, a2
 ; RV32-NEXT:    addi a1, a1, -1
@@ -355,8 +355,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    or a2, a5, a2
 ; RV32-NEXT:    srli a5, a1, 31
 ; RV32-NEXT:    andi a1, a1, 1
-; RV32-NEXT:    slli a1, a1, 1
 ; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    slli a1, a1, 1
 ; RV32-NEXT:    or a0, a5, a0
 ; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    sw a3, 0(s0)
@@ -384,18 +384,18 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    mv s0, a0
 ; RV64-NEXT:    lbu a0, 12(a0)
-; RV64-NEXT:    lwu a1, 8(s0)
-; RV64-NEXT:    ld a2, 0(s0)
+; RV64-NEXT:    ld a1, 0(s0)
+; RV64-NEXT:    lwu a2, 8(s0)
 ; RV64-NEXT:    slli a0, a0, 32
-; RV64-NEXT:    or a0, a1, a0
-; RV64-NEXT:    slli a0, a0, 29
-; RV64-NEXT:    srai s1, a0, 31
-; RV64-NEXT:    srli a0, a2, 2
-; RV64-NEXT:    slli a1, a1, 62
-; RV64-NEXT:    or a0, a1, a0
-; RV64-NEXT:    srai a0, a0, 31
-; RV64-NEXT:    slli a2, a2, 31
-; RV64-NEXT:    srai s2, a2, 31
+; RV64-NEXT:    srli a3, a1, 2
+; RV64-NEXT:    or a0, a2, a0
+; RV64-NEXT:    slli a2, a2, 62
+; RV64-NEXT:    slli a1, a1, 31
+; RV64-NEXT:    or a2, a2, a3
+; RV64-NEXT:    slli s1, a0, 29
+; RV64-NEXT:    srai a0, a2, 31
+; RV64-NEXT:    srai s1, s1, 31
+; RV64-NEXT:    srai s2, a1, 31
 ; RV64-NEXT:    li a1, 7
 ; RV64-NEXT:    call __moddi3
 ; RV64-NEXT:    mv s3, a0
@@ -410,32 +410,32 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64-NEXT:    mv a0, s2
 ; RV64-NEXT:    call __muldi3
 ; RV64-NEXT:    lui a1, %hi(.LCPI3_0)
+; RV64-NEXT:    addi s1, s1, -2
+; RV64-NEXT:    addi s3, s3, -1
 ; RV64-NEXT:    ld a1, %lo(.LCPI3_0)(a1)
+; RV64-NEXT:    seqz a2, s1
+; RV64-NEXT:    seqz a3, s3
+; RV64-NEXT:    addi a3, a3, -1
+; RV64-NEXT:    addi a2, a2, -1
+; RV64-NEXT:    slli a4, a2, 2
+; RV64-NEXT:    slli a5, a3, 31
+; RV64-NEXT:    srli a5, a5, 62
 ; RV64-NEXT:    add a0, a0, a1
-; RV64-NEXT:    slli a2, a0, 63
+; RV64-NEXT:    or a4, a5, a4
+; RV64-NEXT:    slli a5, a0, 63
 ; RV64-NEXT:    srli a0, a0, 1
-; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    or a0, a0, a5
+; RV64-NEXT:    slli a2, a2, 29
+; RV64-NEXT:    slli a3, a3, 33
+; RV64-NEXT:    srli a2, a2, 61
 ; RV64-NEXT:    sltu a0, a1, a0
-; RV64-NEXT:    addi s1, s1, -2
-; RV64-NEXT:    seqz a1, s1
-; RV64-NEXT:    addi s3, s3, -1
-; RV64-NEXT:    seqz a2, s3
 ; RV64-NEXT:    neg a0, a0
-; RV64-NEXT:    addi a2, a2, -1
-; RV64-NEXT:    addi a1, a1, -1
-; RV64-NEXT:    slli a3, a1, 2
-; RV64-NEXT:    slli a4, a2, 31
-; RV64-NEXT:    srli a4, a4, 62
-; RV64-NEXT:    or a3, a4, a3
-; RV64-NEXT:    slli a1, a1, 29
-; RV64-NEXT:    srli a1, a1, 61
 ; RV64-NEXT:    slli a0, a0, 31
 ; RV64-NEXT:    srli a0, a0, 31
-; RV64-NEXT:    slli a2, a2, 33
-; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    or a0, a0, a3
 ; RV64-NEXT:    sd a0, 0(s0)
-; RV64-NEXT:    sw a3, 8(s0)
-; RV64-NEXT:    sb a1, 12(s0)
+; RV64-NEXT:    sw a4, 8(s0)
+; RV64-NEXT:    sb a2, 12(s0)
 ; RV64-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -462,18 +462,18 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    lw a0, 0(a0)
 ; RV32M-NEXT:    slli a4, a1, 30
 ; RV32M-NEXT:    srli s1, a2, 2
+; RV32M-NEXT:    slli a5, a2, 31
 ; RV32M-NEXT:    or s1, s1, a4
-; RV32M-NEXT:    slli a4, a2, 31
-; RV32M-NEXT:    srli a5, a3, 1
-; RV32M-NEXT:    or s2, a5, a4
+; RV32M-NEXT:    srli a4, a3, 1
+; RV32M-NEXT:    or s2, a4, a5
 ; RV32M-NEXT:    srli a1, a1, 2
-; RV32M-NEXT:    slli a1, a1, 31
-; RV32M-NEXT:    srai s3, a1, 31
 ; RV32M-NEXT:    srli a2, a2, 1
+; RV32M-NEXT:    slli a3, a3, 31
+; RV32M-NEXT:    slli a1, a1, 31
 ; RV32M-NEXT:    slli a2, a2, 31
+; RV32M-NEXT:    srai s3, a1, 31
 ; RV32M-NEXT:    srai s4, a2, 31
-; RV32M-NEXT:    slli a1, a3, 31
-; RV32M-NEXT:    srai a1, a1, 31
+; RV32M-NEXT:    srai a1, a3, 31
 ; RV32M-NEXT:    li a2, 6
 ; RV32M-NEXT:    li a3, 0
 ; RV32M-NEXT:    call __moddi3
@@ -492,12 +492,12 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    mv a1, s3
 ; RV32M-NEXT:    call __moddi3
 ; RV32M-NEXT:    or a2, s5, s6
-; RV32M-NEXT:    snez a2, a2
 ; RV32M-NEXT:    xori a0, a0, 2
+; RV32M-NEXT:    xori a3, s2, 1
+; RV32M-NEXT:    snez a2, a2
 ; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    or a1, a3, s4
 ; RV32M-NEXT:    seqz a0, a0
-; RV32M-NEXT:    xori a1, s2, 1
-; RV32M-NEXT:    or a1, a1, s4
 ; RV32M-NEXT:    seqz a1, a1
 ; RV32M-NEXT:    neg a3, a2
 ; RV32M-NEXT:    addi a1, a1, -1
@@ -507,8 +507,8 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32M-NEXT:    or a2, a5, a2
 ; RV32M-NEXT:    srli a5, a1, 31
 ; RV32M-NEXT:    andi a1, a1, 1
-; RV32M-NEXT:    slli a1, a1, 1
 ; RV32M-NEXT:    slli a0, a0, 2
+; RV32M-NEXT:    slli a1, a1, 1
 ; RV32M-NEXT:    or a0, a5, a0
 ; RV32M-NEXT:    or a0, a0, a1
 ; RV32M-NEXT:    sw a3, 0(s0)
@@ -531,62 +531,62 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV64M-NEXT:    ld a1, 0(a0)
 ; RV64M-NEXT:    lwu a2, 8(a0)
 ; RV64M-NEXT:    lbu a3, 12(a0)
-; RV64M-NEXT:    srli a4, a1, 2
-; RV64M-NEXT:    slli a5, a2, 62
-; RV64M-NEXT:    or a4, a5, a4
-; RV64M-NEXT:    srai a4, a4, 31
+; RV64M-NEXT:    lui a4, %hi(.LCPI3_0)
+; RV64M-NEXT:    lui a5, 699051
+; RV64M-NEXT:    addiw a5, a5, -1365
+; RV64M-NEXT:    slli a6, a5, 32
+; RV64M-NEXT:    add a5, a5, a6
+; RV64M-NEXT:    srli a6, a1, 2
+; RV64M-NEXT:    slli a7, a2, 62
+; RV64M-NEXT:    or a6, a7, a6
+; RV64M-NEXT:    lui a7, %hi(.LCPI3_1)
 ; RV64M-NEXT:    slli a3, a3, 32
 ; RV64M-NEXT:    or a2, a2, a3
-; RV64M-NEXT:    slli a2, a2, 29
-; RV64M-NEXT:    lui a3, %hi(.LCPI3_0)
-; RV64M-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
-; RV64M-NEXT:    srai a2, a2, 31
+; RV64M-NEXT:    lui a3, %hi(.LCPI3_2)
+; RV64M-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
+; RV64M-NEXT:    ld a7, %lo(.LCPI3_1)(a7)
+; RV64M-NEXT:    ld a3, %lo(.LCPI3_2)(a3)
 ; RV64M-NEXT:    slli a1, a1, 31
 ; RV64M-NEXT:    srai a1, a1, 31
-; RV64M-NEXT:    mulh a3, a2, a3
-; RV64M-NEXT:    srli a5, a3, 63
-; RV64M-NEXT:    srai a3, a3, 1
-; RV64M-NEXT:    add a3, a3, a5
-; RV64M-NEXT:    lui a5, %hi(.LCPI3_1)
-; RV64M-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
-; RV64M-NEXT:    add a2, a2, a3
-; RV64M-NEXT:    slli a3, a3, 2
-; RV64M-NEXT:    add a2, a2, a3
-; RV64M-NEXT:    mulh a3, a4, a5
-; RV64M-NEXT:    srli a5, a3, 63
-; RV64M-NEXT:    srai a3, a3, 1
-; RV64M-NEXT:    add a3, a3, a5
-; RV64M-NEXT:    slli a5, a3, 3
-; RV64M-NEXT:    add a3, a4, a3
-; RV64M-NEXT:    sub a3, a3, a5
+; RV64M-NEXT:    srai a6, a6, 31
+; RV64M-NEXT:    slli a2, a2, 29
+; RV64M-NEXT:    mul a1, a1, a5
+; RV64M-NEXT:    srai a2, a2, 31
+; RV64M-NEXT:    mulh a5, a6, a7
+; RV64M-NEXT:    add a1, a1, a3
+; RV64M-NEXT:    mulh a4, a2, a4
+; RV64M-NEXT:    srli a7, a5, 63
+; RV64M-NEXT:    srai a5, a5, 1
+; RV64M-NEXT:    add a5, a5, a7
+; RV64M-NEXT:    slli a7, a1, 63
+; RV64M-NEXT:    srli a1, a1, 1
+; RV64M-NEXT:    or a1, a1, a7
+; RV64M-NEXT:    srli a7, a4, 63
+; RV64M-NEXT:    srai a4, a4, 1
+; RV64M-NEXT:    add a4, a4, a7
+; RV64M-NEXT:    sltu a1, a3, a1
+; RV64M-NEXT:    add a6, a6, a5
+; RV64M-NEXT:    slli a5, a5, 3
+; RV64M-NEXT:    add a2, a2, a4
+; RV64M-NEXT:    slli a4, a4, 2
+; RV64M-NEXT:    sub a3, a6, a5
+; RV64M-NEXT:    neg a1, a1
+; RV64M-NEXT:    add a2, a2, a4
 ; RV64M-NEXT:    addi a3, a3, -1
+; RV64M-NEXT:    slli a1, a1, 31
 ; RV64M-NEXT:    seqz a3, a3
-; RV64M-NEXT:    lui a4, 699051
-; RV64M-NEXT:    addiw a4, a4, -1365
-; RV64M-NEXT:    slli a5, a4, 32
-; RV64M-NEXT:    add a4, a4, a5
-; RV64M-NEXT:    lui a5, %hi(.LCPI3_2)
-; RV64M-NEXT:    ld a5, %lo(.LCPI3_2)(a5)
 ; RV64M-NEXT:    addi a2, a2, -2
+; RV64M-NEXT:    srli a1, a1, 31
 ; RV64M-NEXT:    seqz a2, a2
-; RV64M-NEXT:    mul a1, a1, a4
-; RV64M-NEXT:    add a1, a1, a5
-; RV64M-NEXT:    slli a4, a1, 63
-; RV64M-NEXT:    srli a1, a1, 1
-; RV64M-NEXT:    or a1, a1, a4
-; RV64M-NEXT:    sltu a1, a5, a1
-; RV64M-NEXT:    addi a2, a2, -1
 ; RV64M-NEXT:    addi a3, a3, -1
-; RV64M-NEXT:    neg a1, a1
+; RV64M-NEXT:    addi a2, a2, -1
 ; RV64M-NEXT:    slli a4, a3, 33
-; RV64M-NEXT:    slli a1, a1, 31
-; RV64M-NEXT:    srli a1, a1, 31
+; RV64M-NEXT:    slli a3, a3, 31
 ; RV64M-NEXT:    or a1, a1, a4
 ; RV64M-NEXT:    slli a4, a2, 2
-; RV64M-NEXT:    slli a3, a3, 31
 ; RV64M-NEXT:    srli a3, a3, 62
-; RV64M-NEXT:    or a3, a3, a4
 ; RV64M-NEXT:    slli a2, a2, 29
+; RV64M-NEXT:    or a3, a3, a4
 ; RV64M-NEXT:    srli a2, a2, 61
 ; RV64M-NEXT:    sd a1, 0(a0)
 ; RV64M-NEXT:    sw a3, 8(a0)
@@ -606,28 +606,28 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    slli a1, a1, 1
 ; RV32MV-NEXT:    sub sp, sp, a1
 ; RV32MV-NEXT:    mv s0, a0
-; RV32MV-NEXT:    lbu a1, 12(a0)
-; RV32MV-NEXT:    lw a2, 8(a0)
+; RV32MV-NEXT:    lw a1, 8(a0)
+; RV32MV-NEXT:    lbu a2, 12(a0)
 ; RV32MV-NEXT:    lw a3, 4(a0)
 ; RV32MV-NEXT:    lw a0, 0(a0)
-; RV32MV-NEXT:    slli a4, a1, 30
-; RV32MV-NEXT:    srli s1, a2, 2
-; RV32MV-NEXT:    or s1, s1, a4
-; RV32MV-NEXT:    slli a4, a2, 31
+; RV32MV-NEXT:    li a4, 1
+; RV32MV-NEXT:    slli a5, a2, 30
+; RV32MV-NEXT:    srli s1, a1, 2
+; RV32MV-NEXT:    slli a6, a1, 31
+; RV32MV-NEXT:    or s1, s1, a5
 ; RV32MV-NEXT:    srli a5, a3, 1
-; RV32MV-NEXT:    or s2, a5, a4
-; RV32MV-NEXT:    srli a1, a1, 2
-; RV32MV-NEXT:    slli a1, a1, 31
-; RV32MV-NEXT:    srai s3, a1, 31
-; RV32MV-NEXT:    srli a2, a2, 1
+; RV32MV-NEXT:    or s2, a5, a6
+; RV32MV-NEXT:    li a5, -1
+; RV32MV-NEXT:    srli a2, a2, 2
+; RV32MV-NEXT:    srli a1, a1, 1
+; RV32MV-NEXT:    slli a3, a3, 31
 ; RV32MV-NEXT:    slli a2, a2, 31
-; RV32MV-NEXT:    srai s4, a2, 31
-; RV32MV-NEXT:    slli a1, a3, 31
-; RV32MV-NEXT:    srai a1, a1, 31
-; RV32MV-NEXT:    li a2, 1
-; RV32MV-NEXT:    li a3, -1
-; RV32MV-NEXT:    sw a3, 16(sp)
-; RV32MV-NEXT:    sw a2, 20(sp)
+; RV32MV-NEXT:    slli a6, a1, 31
+; RV32MV-NEXT:    srai a1, a3, 31
+; RV32MV-NEXT:    srai s3, a2, 31
+; RV32MV-NEXT:    srai s4, a6, 31
+; RV32MV-NEXT:    sw a5, 16(sp)
+; RV32MV-NEXT:    sw a4, 20(sp)
 ; RV32MV-NEXT:    li a2, 6
 ; RV32MV-NEXT:    li a3, 0
 ; RV32MV-NEXT:    call __moddi3
@@ -681,33 +681,33 @@ define void @test_srem_vec(ptr %X) nounwind {
 ; RV32MV-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV32MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
 ; RV32MV-NEXT:    vslidedown.vi v10, v8, 1
+; RV32MV-NEXT:    vslidedown.vi v11, v8, 2
 ; RV32MV-NEXT:    vmv.x.s a0, v10
-; RV32MV-NEXT:    vslidedown.vi v10, v8, 2
-; RV32MV-NEXT:    vmv.x.s a1, v10
-; RV32MV-NEXT:    slli a2, a1, 1
-; RV32MV-NEXT:    sub a2, a2, a0
+; RV32MV-NEXT:    vmv.x.s a1, v11
 ; RV32MV-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
 ; RV32MV-NEXT:    vslidedown.vi v10, v8, 4
-; RV32MV-NEXT:    vmv.x.s a0, v10
-; RV32MV-NEXT:    srli a3, a0, 30
+; RV32MV-NEXT:    vmv.x.s a2, v10
 ; RV32MV-NEXT:    vslidedown.vi v10, v8, 5
-; RV32MV-NEXT:    vmv.x.s a4, v10
-; RV32MV-NEXT:    slli a4, a4, 2
-; RV32MV-NEXT:    or a3, a4, a3
+; RV32MV-NEXT:    vmv.x.s a3, v10
+; RV32MV-NEXT:    slli a4, a1, 1
+; RV32MV-NEXT:    sub a4, a4, a0
+; RV32MV-NEXT:    srli a0, a2, 30
+; RV32MV-NEXT:    slli a3, a3, 2
+; RV32MV-NEXT:    or a0, a3, a0
 ; RV32MV-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
 ; RV32MV-NEXT:    vse32.v v8, (s0)
-; RV32MV-NEXT:    andi a3, a3, 7
-; RV32MV-NEXT:    srli a1, a1, 31
 ; RV32MV-NEXT:    vslidedown.vi v8, v8, 3
-; RV32MV-NEXT:    slli a0, a0, 2
-; RV32MV-NEXT:    or a0, a1, a0
-; RV32MV-NEXT:    vmv.x.s a1, v8
-; RV32MV-NEXT:    andi a1, a1, 1
-; RV32MV-NEXT:    slli a1, a1, 1
-; RV32MV-NEXT:    or a0, a0, a1
-; RV32MV-NEXT:    sw a2, 4(s0)
-; RV32MV-NEXT:    sw a0, 8(s0)
-; RV32MV-NEXT:    sb a3, 12(s0)
+; RV32MV-NEXT:    srli a1, a1, 31
+; RV32MV-NEXT:    slli a2, a2, 2
+; RV32MV-NEXT:    or a1, a1, a2
+; RV32MV-NEXT:    vmv.x.s a2, v8
+; RV32MV-NEXT:    andi a2, a2, 1
+; RV32MV-NEXT:    slli a2, a2, 1
+; RV32MV-NEXT:    andi a0, a0, 7
+; RV32MV-NEXT:    or a1, a1, a2
+; RV32MV-NEXT:    sw a4, 4(s0)
+; RV32MV-NEXT:    sw a1, 8(s0)
+; RV32MV-NEXT:    sb a0, 12(s0)
 ; RV32MV-NEXT:    csrr a0, vlenb
 ; RV32MV-NEXT:    slli a0, a0, 1
 ; RV32MV-NEXT:    add sp, sp, a0
@@ -722,79 +722,79 @@ define void @test_srem_vec(ptr %X) nounwind {
 ;
 ; RV64MV-LABEL: test_srem_vec:
 ; RV64MV:       # %bb.0:
-; RV64MV-NEXT:    lbu a1, 12(a0)
+; RV64MV-NEXT:    ld a1, 0(a0)
 ; RV64MV-NEXT:    lwu a2, 8(a0)
-; RV64MV-NEXT:    ld a3, 0(a0)
-; RV64MV-NEXT:    slli a1, a1, 32
-; RV64MV-NEXT:    or a1, a2, a1
-; RV64MV-NEXT:    slli a1, a1, 29
-; RV64MV-NEXT:    srai a1, a1, 31
-; RV64MV-NEXT:    srli a4, a3, 2
-; RV64MV-NEXT:    slli a2, a2, 62
-; RV64MV-NEXT:    or a2, a2, a4
+; RV64MV-NEXT:    lbu a3, 12(a0)
 ; RV64MV-NEXT:    lui a4, %hi(.LCPI3_0)
+; RV64MV-NEXT:    lui a5, %hi(.LCPI3_1)
+; RV64MV-NEXT:    lui a6, %hi(.LCPI3_2)
+; RV64MV-NEXT:    lui a7, 32
 ; RV64MV-NEXT:    ld a4, %lo(.LCPI3_0)(a4)
+; RV64MV-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
+; RV64MV-NEXT:    ld a6, %lo(.LCPI3_2)(a6)
+; RV64MV-NEXT:    addi a7, a7, 256
+; RV64MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64MV-NEXT:    vmv.s.x v8, a7
+; RV64MV-NEXT:    slli a3, a3, 32
+; RV64MV-NEXT:    srli a7, a1, 2
+; RV64MV-NEXT:    or a3, a2, a3
+; RV64MV-NEXT:    slli a2, a2, 62
+; RV64MV-NEXT:    slli a1, a1, 31
+; RV64MV-NEXT:    or a2, a2, a7
+; RV64MV-NEXT:    srai a1, a1, 31
+; RV64MV-NEXT:    slli a3, a3, 29
 ; RV64MV-NEXT:    srai a2, a2, 31
-; RV64MV-NEXT:    slli a3, a3, 31
+; RV64MV-NEXT:    mulh a5, a1, a5
 ; RV64MV-NEXT:    srai a3, a3, 31
 ; RV64MV-NEXT:    mulh a4, a2, a4
-; RV64MV-NEXT:    srli a5, a4, 63
+; RV64MV-NEXT:    srli a7, a5, 63
+; RV64MV-NEXT:    add a5, a5, a7
+; RV64MV-NEXT:    srli a7, a4, 63
 ; RV64MV-NEXT:    srai a4, a4, 1
-; RV64MV-NEXT:    add a4, a4, a5
-; RV64MV-NEXT:    lui a5, %hi(.LCPI3_1)
-; RV64MV-NEXT:    ld a5, %lo(.LCPI3_1)(a5)
+; RV64MV-NEXT:    mulh a6, a3, a6
+; RV64MV-NEXT:    add a4, a4, a7
+; RV64MV-NEXT:    slli a7, a5, 3
+; RV64MV-NEXT:    slli a5, a5, 1
+; RV64MV-NEXT:    sub a5, a5, a7
+; RV64MV-NEXT:    srli a7, a6, 63
+; RV64MV-NEXT:    srai a6, a6, 1
+; RV64MV-NEXT:    add a6, a6, a7
 ; RV64MV-NEXT:    add a2, a2, a4
 ; RV64MV-NEXT:    slli a4, a4, 3
 ; RV64MV-NEXT:    sub a2, a2, a4
-; RV64MV-NEXT:    mulh a4, a3, a5
-; RV64MV-NEXT:    srli a5, a4, 63
-; RV64MV-NEXT:    add a4, a4, a5
-; RV64MV-NEXT:    slli a5, a4, 3
-; RV64MV-NEXT:    slli a4, a4, 1
-; RV64MV-NEXT:    sub a4, a4, a5
-; RV64MV-NEXT:    lui a5, %hi(.LCPI3_2)
-; RV64MV-NEXT:    ld a5, %lo(.LCPI3_2)(a5)
-; RV64MV-NEXT:    add a3, a3, a4
-; RV64MV-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; RV64MV-NEXT:    vmv.v.x v8, a3
+; RV64MV-NEXT:    add a1, a1, a5
+; RV64MV-NEXT:    li a4, -1
+; RV64MV-NEXT:    srli a4, a4, 31
+; RV64MV-NEXT:    vsext.vf8 v10, v8
+; RV64MV-NEXT:    add a3, a3, a6
+; RV64MV-NEXT:    slli a6, a6, 2
+; RV64MV-NEXT:    vmv.v.x v8, a1
+; RV64MV-NEXT:    add a3, a3, a6
 ; RV64MV-NEXT:    vslide1down.vx v8, v8, a2
-; RV64MV-NEXT:    mulh a2, a1, a5
-; RV64MV-NEXT:    srli a3, a2, 63
-; RV64MV-NEXT:    srai a2, a2, 1
-; RV64MV-NEXT:    add a2, a2, a3
-; RV64MV-NEXT:    slli a3, a2, 2
-; RV64MV-NEXT:    add a1, a1, a2
-; RV64MV-NEXT:    add a1, a1, a3
-; RV64MV-NEXT:    vslide1down.vx v8, v8, a1
+; RV64MV-NEXT:    vslide1down.vx v8, v8, a3
 ; RV64MV-NEXT:    vslidedown.vi v8, v8, 1
-; RV64MV-NEXT:    li a1, -1
-; RV64MV-NEXT:    srli a1, a1, 31
-; RV64MV-NEXT:    vand.vx v8, v8, a1
-; RV64MV-NEXT:    lui a2, 32
-; RV64MV-NEXT:    addi a2, a2, 256
-; RV64MV-NEXT:    vmv.s.x v10, a2
-; RV64MV-NEXT:    vsext.vf8 v12, v10
-; RV64MV-NEXT:    vmsne.vv v0, v8, v12
+; RV64MV-NEXT:    vand.vx v8, v8, a4
+; RV64MV-NEXT:    vmsne.vv v0, v8, v10
 ; RV64MV-NEXT:    vmv.v.i v8, 0
 ; RV64MV-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV64MV-NEXT:    vslidedown.vi v10, v8, 2
-; RV64MV-NEXT:    vmv.x.s a2, v10
-; RV64MV-NEXT:    slli a3, a2, 31
-; RV64MV-NEXT:    srli a3, a3, 61
-; RV64MV-NEXT:    vmv.x.s a4, v8
-; RV64MV-NEXT:    and a1, a4, a1
+; RV64MV-NEXT:    vmv.x.s a1, v8
 ; RV64MV-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV64MV-NEXT:    vslidedown.vi v8, v8, 1
-; RV64MV-NEXT:    vmv.x.s a4, v8
-; RV64MV-NEXT:    slli a5, a4, 33
-; RV64MV-NEXT:    or a1, a1, a5
+; RV64MV-NEXT:    vmv.x.s a2, v10
+; RV64MV-NEXT:    and a1, a1, a4
+; RV64MV-NEXT:    vmv.x.s a3, v8
+; RV64MV-NEXT:    slli a4, a2, 31
+; RV64MV-NEXT:    slli a5, a3, 33
 ; RV64MV-NEXT:    slli a2, a2, 2
-; RV64MV-NEXT:    slli a4, a4, 31
-; RV64MV-NEXT:    srli a4, a4, 62
-; RV64MV-NEXT:    or a2, a4, a2
+; RV64MV-NEXT:    slli a3, a3, 31
+; RV64MV-NEXT:    srli a4, a4, 61
+; RV64MV-NEXT:    or a1, a1, a5
+; RV64MV-NEXT:    srli a3, a3, 62
+; RV64MV-NEXT:    or a2, a3, a2
 ; RV64MV-NEXT:    sd a1, 0(a0)
 ; RV64MV-NEXT:    sw a2, 8(a0)
-; RV64MV-NEXT:    sb a3, 12(a0)
+; RV64MV-NEXT:    sb a4, 12(a0)
 ; RV64MV-NEXT:    ret
   %ld = load <3 x i33>, ptr %X
   %srem = srem <3 x i33> %ld, <i33 6, i33 7, i33 -5>

diff  --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index c7e57021b90dc1..cf65d4e0cf8051 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -53,50 +53,50 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: fold_srem_vec_1:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 0(a1)
-; RV32IM-NEXT:    lh a3, 4(a1)
+; RV32IM-NEXT:    lh a3, 0(a1)
+; RV32IM-NEXT:    lh a2, 4(a1)
 ; RV32IM-NEXT:    lh a4, 8(a1)
 ; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 706409
+; RV32IM-NEXT:    lui a6, 507375
+; RV32IM-NEXT:    lui a7, 342392
+; RV32IM-NEXT:    lui t0, 780943
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a5, a2, a5
-; RV32IM-NEXT:    add a5, a5, a2
-; RV32IM-NEXT:    srli a6, a5, 31
-; RV32IM-NEXT:    srli a5, a5, 6
-; RV32IM-NEXT:    add a5, a5, a6
-; RV32IM-NEXT:    li a6, 95
-; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    lui a5, 507375
-; RV32IM-NEXT:    addi a5, a5, 1981
+; RV32IM-NEXT:    addi a6, a6, 1981
+; RV32IM-NEXT:    addi a7, a7, 669
+; RV32IM-NEXT:    addi t0, t0, 1809
 ; RV32IM-NEXT:    mulh a5, a3, a5
-; RV32IM-NEXT:    sub a5, a5, a3
-; RV32IM-NEXT:    srli a6, a5, 31
+; RV32IM-NEXT:    mulh a6, a2, a6
+; RV32IM-NEXT:    mulh a7, a4, a7
+; RV32IM-NEXT:    mulh t0, a1, t0
+; RV32IM-NEXT:    add a5, a5, a3
+; RV32IM-NEXT:    sub a6, a6, a2
+; RV32IM-NEXT:    srli t1, a7, 31
+; RV32IM-NEXT:    srli a7, a7, 5
+; RV32IM-NEXT:    add a7, a7, t1
+; RV32IM-NEXT:    srli t1, t0, 31
+; RV32IM-NEXT:    srli t0, t0, 8
+; RV32IM-NEXT:    add t0, t0, t1
+; RV32IM-NEXT:    srli t1, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
-; RV32IM-NEXT:    add a5, a5, a6
-; RV32IM-NEXT:    li a6, -124
-; RV32IM-NEXT:    mul a5, a5, a6
+; RV32IM-NEXT:    add a5, a5, t1
+; RV32IM-NEXT:    srli t1, a6, 31
+; RV32IM-NEXT:    srli a6, a6, 6
+; RV32IM-NEXT:    add a6, a6, t1
+; RV32IM-NEXT:    li t1, 98
+; RV32IM-NEXT:    mul a7, a7, t1
+; RV32IM-NEXT:    li t1, -1003
+; RV32IM-NEXT:    mul t0, t0, t1
+; RV32IM-NEXT:    li t1, 95
+; RV32IM-NEXT:    mul a5, a5, t1
+; RV32IM-NEXT:    li t1, -124
+; RV32IM-NEXT:    mul a6, a6, t1
+; RV32IM-NEXT:    sub a4, a4, a7
+; RV32IM-NEXT:    sub a1, a1, t0
 ; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    lui a5, 342392
-; RV32IM-NEXT:    addi a5, a5, 669
-; RV32IM-NEXT:    mulh a5, a4, a5
-; RV32IM-NEXT:    srli a6, a5, 31
-; RV32IM-NEXT:    srli a5, a5, 5
-; RV32IM-NEXT:    add a5, a5, a6
-; RV32IM-NEXT:    li a6, 98
-; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
-; RV32IM-NEXT:    lui a5, 780943
-; RV32IM-NEXT:    addi a5, a5, 1809
-; RV32IM-NEXT:    mulh a5, a1, a5
-; RV32IM-NEXT:    srli a6, a5, 31
-; RV32IM-NEXT:    srli a5, a5, 8
-; RV32IM-NEXT:    add a5, a5, a6
-; RV32IM-NEXT:    li a6, -1003
-; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    sh a2, 0(a0)
-; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sub a2, a2, a6
+; RV32IM-NEXT:    sh a3, 0(a0)
+; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    sh a4, 4(a0)
 ; RV32IM-NEXT:    sh a1, 6(a0)
 ; RV32IM-NEXT:    ret
@@ -145,51 +145,51 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_srem_vec_1:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, %hi(.LCPI0_0)
-; RV64IM-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
 ; RV64IM-NEXT:    lh a3, 0(a1)
-; RV64IM-NEXT:    lh a4, 8(a1)
-; RV64IM-NEXT:    lh a5, 16(a1)
+; RV64IM-NEXT:    lh a2, 8(a1)
+; RV64IM-NEXT:    lh a4, 16(a1)
 ; RV64IM-NEXT:    lh a1, 24(a1)
-; RV64IM-NEXT:    mulh a2, a3, a2
-; RV64IM-NEXT:    add a2, a2, a3
-; RV64IM-NEXT:    srli a6, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 6
-; RV64IM-NEXT:    add a2, a2, a6
+; RV64IM-NEXT:    lui a5, %hi(.LCPI0_0)
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_1)
+; RV64IM-NEXT:    lui a7, %hi(.LCPI0_2)
+; RV64IM-NEXT:    lui t0, %hi(.LCPI0_3)
+; RV64IM-NEXT:    ld a5, %lo(.LCPI0_0)(a5)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_1)(a6)
-; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul a2, a2, a7
-; RV64IM-NEXT:    subw a3, a3, a2
-; RV64IM-NEXT:    mulh a2, a4, a6
-; RV64IM-NEXT:    sub a2, a2, a4
-; RV64IM-NEXT:    srli a6, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 6
-; RV64IM-NEXT:    add a2, a2, a6
-; RV64IM-NEXT:    lui a6, %hi(.LCPI0_2)
-; RV64IM-NEXT:    ld a6, %lo(.LCPI0_2)(a6)
-; RV64IM-NEXT:    li a7, -124
-; RV64IM-NEXT:    mul a2, a2, a7
-; RV64IM-NEXT:    subw a4, a4, a2
-; RV64IM-NEXT:    mulh a2, a5, a6
-; RV64IM-NEXT:    srli a6, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 5
-; RV64IM-NEXT:    add a2, a2, a6
-; RV64IM-NEXT:    lui a6, %hi(.LCPI0_3)
-; RV64IM-NEXT:    ld a6, %lo(.LCPI0_3)(a6)
-; RV64IM-NEXT:    li a7, 98
-; RV64IM-NEXT:    mul a2, a2, a7
-; RV64IM-NEXT:    subw a5, a5, a2
-; RV64IM-NEXT:    mulh a2, a1, a6
-; RV64IM-NEXT:    srli a6, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 7
-; RV64IM-NEXT:    add a2, a2, a6
-; RV64IM-NEXT:    li a6, -1003
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    ld a7, %lo(.LCPI0_2)(a7)
+; RV64IM-NEXT:    ld t0, %lo(.LCPI0_3)(t0)
+; RV64IM-NEXT:    mulh a5, a3, a5
+; RV64IM-NEXT:    mulh a6, a2, a6
+; RV64IM-NEXT:    mulh a7, a4, a7
+; RV64IM-NEXT:    mulh t0, a1, t0
+; RV64IM-NEXT:    add a5, a5, a3
+; RV64IM-NEXT:    sub a6, a6, a2
+; RV64IM-NEXT:    srli t1, a7, 63
+; RV64IM-NEXT:    srli a7, a7, 5
+; RV64IM-NEXT:    add a7, a7, t1
+; RV64IM-NEXT:    srli t1, t0, 63
+; RV64IM-NEXT:    srli t0, t0, 7
+; RV64IM-NEXT:    add t0, t0, t1
+; RV64IM-NEXT:    srli t1, a5, 63
+; RV64IM-NEXT:    srli a5, a5, 6
+; RV64IM-NEXT:    add a5, a5, t1
+; RV64IM-NEXT:    srli t1, a6, 63
+; RV64IM-NEXT:    srli a6, a6, 6
+; RV64IM-NEXT:    add a6, a6, t1
+; RV64IM-NEXT:    li t1, 98
+; RV64IM-NEXT:    mul a7, a7, t1
+; RV64IM-NEXT:    li t1, -1003
+; RV64IM-NEXT:    mul t0, t0, t1
+; RV64IM-NEXT:    li t1, 95
+; RV64IM-NEXT:    mul a5, a5, t1
+; RV64IM-NEXT:    li t1, -124
+; RV64IM-NEXT:    mul a6, a6, t1
+; RV64IM-NEXT:    subw a4, a4, a7
+; RV64IM-NEXT:    subw a1, a1, t0
+; RV64IM-NEXT:    subw a3, a3, a5
+; RV64IM-NEXT:    subw a2, a2, a6
 ; RV64IM-NEXT:    sh a3, 0(a0)
-; RV64IM-NEXT:    sh a4, 2(a0)
-; RV64IM-NEXT:    sh a5, 4(a0)
+; RV64IM-NEXT:    sh a2, 2(a0)
+; RV64IM-NEXT:    sh a4, 4(a0)
 ; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
@@ -248,33 +248,33 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lui a5, 706409
 ; RV32IM-NEXT:    addi a5, a5, 389
 ; RV32IM-NEXT:    mulh a6, a2, a5
-; RV32IM-NEXT:    add a6, a6, a2
-; RV32IM-NEXT:    srli a7, a6, 31
-; RV32IM-NEXT:    srli a6, a6, 6
-; RV32IM-NEXT:    add a6, a6, a7
-; RV32IM-NEXT:    li a7, 95
-; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a2, a2, a6
-; RV32IM-NEXT:    mulh a6, a3, a5
-; RV32IM-NEXT:    add a6, a6, a3
-; RV32IM-NEXT:    srli t0, a6, 31
-; RV32IM-NEXT:    srli a6, a6, 6
-; RV32IM-NEXT:    add a6, a6, t0
-; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a3, a3, a6
-; RV32IM-NEXT:    mulh a6, a4, a5
-; RV32IM-NEXT:    add a6, a6, a4
-; RV32IM-NEXT:    srli t0, a6, 31
-; RV32IM-NEXT:    srli a6, a6, 6
-; RV32IM-NEXT:    add a6, a6, t0
-; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    mulh a7, a3, a5
+; RV32IM-NEXT:    mulh t0, a4, a5
 ; RV32IM-NEXT:    mulh a5, a1, a5
+; RV32IM-NEXT:    add a6, a6, a2
+; RV32IM-NEXT:    add a7, a7, a3
+; RV32IM-NEXT:    add t0, t0, a4
 ; RV32IM-NEXT:    add a5, a5, a1
-; RV32IM-NEXT:    srli a6, a5, 31
+; RV32IM-NEXT:    srli t1, a6, 31
+; RV32IM-NEXT:    srli a6, a6, 6
+; RV32IM-NEXT:    add a6, a6, t1
+; RV32IM-NEXT:    srli t1, a7, 31
+; RV32IM-NEXT:    srli a7, a7, 6
+; RV32IM-NEXT:    add a7, a7, t1
+; RV32IM-NEXT:    srli t1, t0, 31
+; RV32IM-NEXT:    srli t0, t0, 6
+; RV32IM-NEXT:    add t0, t0, t1
+; RV32IM-NEXT:    srli t1, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
-; RV32IM-NEXT:    add a5, a5, a6
-; RV32IM-NEXT:    mul a5, a5, a7
+; RV32IM-NEXT:    add a5, a5, t1
+; RV32IM-NEXT:    li t1, 95
+; RV32IM-NEXT:    mul a6, a6, t1
+; RV32IM-NEXT:    mul a7, a7, t1
+; RV32IM-NEXT:    mul t0, t0, t1
+; RV32IM-NEXT:    mul a5, a5, t1
+; RV32IM-NEXT:    sub a2, a2, a6
+; RV32IM-NEXT:    sub a3, a3, a7
+; RV32IM-NEXT:    sub a4, a4, t0
 ; RV32IM-NEXT:    sub a1, a1, a5
 ; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    sh a3, 2(a0)
@@ -333,33 +333,33 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lh a5, 16(a1)
 ; RV64IM-NEXT:    lh a1, 24(a1)
 ; RV64IM-NEXT:    mulh a6, a3, a2
-; RV64IM-NEXT:    add a6, a6, a3
-; RV64IM-NEXT:    srli a7, a6, 63
-; RV64IM-NEXT:    srli a6, a6, 6
-; RV64IM-NEXT:    add a6, a6, a7
-; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a3, a3, a6
-; RV64IM-NEXT:    mulh a6, a4, a2
-; RV64IM-NEXT:    add a6, a6, a4
-; RV64IM-NEXT:    srli t0, a6, 63
-; RV64IM-NEXT:    srli a6, a6, 6
-; RV64IM-NEXT:    add a6, a6, t0
-; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a4, a4, a6
-; RV64IM-NEXT:    mulh a6, a5, a2
-; RV64IM-NEXT:    add a6, a6, a5
-; RV64IM-NEXT:    srli t0, a6, 63
-; RV64IM-NEXT:    srli a6, a6, 6
-; RV64IM-NEXT:    add a6, a6, t0
-; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a5, a5, a6
+; RV64IM-NEXT:    mulh a7, a4, a2
+; RV64IM-NEXT:    mulh t0, a5, a2
 ; RV64IM-NEXT:    mulh a2, a1, a2
+; RV64IM-NEXT:    add a6, a6, a3
+; RV64IM-NEXT:    add a7, a7, a4
+; RV64IM-NEXT:    add t0, t0, a5
 ; RV64IM-NEXT:    add a2, a2, a1
-; RV64IM-NEXT:    srli a6, a2, 63
+; RV64IM-NEXT:    srli t1, a6, 63
+; RV64IM-NEXT:    srli a6, a6, 6
+; RV64IM-NEXT:    add a6, a6, t1
+; RV64IM-NEXT:    srli t1, a7, 63
+; RV64IM-NEXT:    srli a7, a7, 6
+; RV64IM-NEXT:    add a7, a7, t1
+; RV64IM-NEXT:    srli t1, t0, 63
+; RV64IM-NEXT:    srli t0, t0, 6
+; RV64IM-NEXT:    add t0, t0, t1
+; RV64IM-NEXT:    srli t1, a2, 63
 ; RV64IM-NEXT:    srli a2, a2, 6
-; RV64IM-NEXT:    add a2, a2, a6
-; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    add a2, a2, t1
+; RV64IM-NEXT:    li t1, 95
+; RV64IM-NEXT:    mul a6, a6, t1
+; RV64IM-NEXT:    mul a7, a7, t1
+; RV64IM-NEXT:    mul t0, t0, t1
+; RV64IM-NEXT:    mul a2, a2, t1
+; RV64IM-NEXT:    subw a3, a3, a6
+; RV64IM-NEXT:    subw a4, a4, a7
+; RV64IM-NEXT:    subw a5, a5, t0
 ; RV64IM-NEXT:    subw a1, a1, a2
 ; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    sh a4, 2(a0)
@@ -445,49 +445,49 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: combine_srem_sdiv:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lh a2, 12(a1)
-; RV32IM-NEXT:    lh a3, 0(a1)
-; RV32IM-NEXT:    lh a4, 4(a1)
-; RV32IM-NEXT:    lh a1, 8(a1)
+; RV32IM-NEXT:    lh a2, 0(a1)
+; RV32IM-NEXT:    lh a3, 4(a1)
+; RV32IM-NEXT:    lh a4, 8(a1)
+; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 706409
+; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    addi a5, a5, 389
-; RV32IM-NEXT:    mulh a6, a2, a5
-; RV32IM-NEXT:    add a6, a6, a2
-; RV32IM-NEXT:    srli a7, a6, 31
-; RV32IM-NEXT:    srai a6, a6, 6
-; RV32IM-NEXT:    add a6, a6, a7
-; RV32IM-NEXT:    li a7, 95
-; RV32IM-NEXT:    mul t0, a6, a7
-; RV32IM-NEXT:    mulh t1, a1, a5
-; RV32IM-NEXT:    add t1, t1, a1
-; RV32IM-NEXT:    srli t2, t1, 31
+; RV32IM-NEXT:    mulh a7, a1, a5
+; RV32IM-NEXT:    mulh t0, a4, a5
+; RV32IM-NEXT:    mulh t1, a3, a5
+; RV32IM-NEXT:    mulh a5, a2, a5
+; RV32IM-NEXT:    add a7, a7, a1
+; RV32IM-NEXT:    add t0, t0, a4
+; RV32IM-NEXT:    add t1, t1, a3
+; RV32IM-NEXT:    add a5, a5, a2
+; RV32IM-NEXT:    srli t2, a7, 31
+; RV32IM-NEXT:    srai a7, a7, 6
+; RV32IM-NEXT:    srli t3, t0, 31
+; RV32IM-NEXT:    srai t0, t0, 6
+; RV32IM-NEXT:    srli t4, t1, 31
 ; RV32IM-NEXT:    srai t1, t1, 6
-; RV32IM-NEXT:    add t1, t1, t2
-; RV32IM-NEXT:    mul t2, t1, a7
-; RV32IM-NEXT:    mulh t3, a4, a5
-; RV32IM-NEXT:    add t3, t3, a4
-; RV32IM-NEXT:    srli t4, t3, 31
-; RV32IM-NEXT:    srai t3, t3, 6
-; RV32IM-NEXT:    add t3, t3, t4
-; RV32IM-NEXT:    mul t4, t3, a7
-; RV32IM-NEXT:    mulh a5, a3, a5
-; RV32IM-NEXT:    add a5, a5, a3
 ; RV32IM-NEXT:    srli t5, a5, 31
 ; RV32IM-NEXT:    srai a5, a5, 6
+; RV32IM-NEXT:    add a7, a7, t2
+; RV32IM-NEXT:    add t0, t0, t3
+; RV32IM-NEXT:    add t1, t1, t4
 ; RV32IM-NEXT:    add a5, a5, t5
-; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    add a3, a3, a5
-; RV32IM-NEXT:    sub a3, a3, a7
-; RV32IM-NEXT:    add a4, a4, t3
-; RV32IM-NEXT:    sub a4, a4, t4
-; RV32IM-NEXT:    add a1, a1, t1
+; RV32IM-NEXT:    mul t2, a7, a6
+; RV32IM-NEXT:    mul t3, t0, a6
+; RV32IM-NEXT:    mul t4, t1, a6
+; RV32IM-NEXT:    mul a6, a5, a6
+; RV32IM-NEXT:    add a2, a2, a5
+; RV32IM-NEXT:    add a3, a3, t1
+; RV32IM-NEXT:    add a4, a4, t0
+; RV32IM-NEXT:    add a1, a1, a7
+; RV32IM-NEXT:    sub a2, a2, a6
+; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    sub a4, a4, t3
 ; RV32IM-NEXT:    sub a1, a1, t2
-; RV32IM-NEXT:    add a2, a2, a6
-; RV32IM-NEXT:    sub a2, a2, t0
-; RV32IM-NEXT:    sh a3, 0(a0)
-; RV32IM-NEXT:    sh a4, 2(a0)
-; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
+; RV32IM-NEXT:    sh a1, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: combine_srem_sdiv:
@@ -562,49 +562,49 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: combine_srem_sdiv:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 24(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI2_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI2_0)(a3)
-; RV64IM-NEXT:    lh a4, 0(a1)
-; RV64IM-NEXT:    lh a5, 8(a1)
-; RV64IM-NEXT:    lh a1, 16(a1)
-; RV64IM-NEXT:    mulh a6, a2, a3
-; RV64IM-NEXT:    add a6, a6, a2
-; RV64IM-NEXT:    srli a7, a6, 63
-; RV64IM-NEXT:    srai a6, a6, 6
-; RV64IM-NEXT:    add a6, a6, a7
-; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul t0, a6, a7
-; RV64IM-NEXT:    mulh t1, a1, a3
+; RV64IM-NEXT:    lh a2, 16(a1)
+; RV64IM-NEXT:    lh a3, 24(a1)
+; RV64IM-NEXT:    lui a4, %hi(.LCPI2_0)
+; RV64IM-NEXT:    ld a4, %lo(.LCPI2_0)(a4)
+; RV64IM-NEXT:    lh a5, 0(a1)
+; RV64IM-NEXT:    lh a1, 8(a1)
+; RV64IM-NEXT:    li a6, 95
+; RV64IM-NEXT:    mulh a7, a3, a4
+; RV64IM-NEXT:    mulh t0, a2, a4
+; RV64IM-NEXT:    mulh t1, a1, a4
+; RV64IM-NEXT:    mulh a4, a5, a4
+; RV64IM-NEXT:    add a7, a7, a3
+; RV64IM-NEXT:    add t0, t0, a2
 ; RV64IM-NEXT:    add t1, t1, a1
-; RV64IM-NEXT:    srli t2, t1, 63
+; RV64IM-NEXT:    add a4, a4, a5
+; RV64IM-NEXT:    srli t2, a7, 63
+; RV64IM-NEXT:    srai a7, a7, 6
+; RV64IM-NEXT:    srli t3, t0, 63
+; RV64IM-NEXT:    srai t0, t0, 6
+; RV64IM-NEXT:    srli t4, t1, 63
 ; RV64IM-NEXT:    srai t1, t1, 6
-; RV64IM-NEXT:    add t1, t1, t2
-; RV64IM-NEXT:    mul t2, t1, a7
-; RV64IM-NEXT:    mulh t3, a5, a3
-; RV64IM-NEXT:    add t3, t3, a5
-; RV64IM-NEXT:    srli t4, t3, 63
-; RV64IM-NEXT:    srai t3, t3, 6
-; RV64IM-NEXT:    add t3, t3, t4
-; RV64IM-NEXT:    mul t4, t3, a7
-; RV64IM-NEXT:    mulh a3, a4, a3
-; RV64IM-NEXT:    add a3, a3, a4
-; RV64IM-NEXT:    srli t5, a3, 63
-; RV64IM-NEXT:    srai a3, a3, 6
-; RV64IM-NEXT:    add a3, a3, t5
-; RV64IM-NEXT:    mul a7, a3, a7
-; RV64IM-NEXT:    add a3, a4, a3
-; RV64IM-NEXT:    subw a3, a3, a7
-; RV64IM-NEXT:    add a5, a5, t3
-; RV64IM-NEXT:    subw a4, a5, t4
+; RV64IM-NEXT:    srli t5, a4, 63
+; RV64IM-NEXT:    srai a4, a4, 6
+; RV64IM-NEXT:    add a7, a7, t2
+; RV64IM-NEXT:    add t0, t0, t3
+; RV64IM-NEXT:    add t1, t1, t4
+; RV64IM-NEXT:    add a4, a4, t5
+; RV64IM-NEXT:    mul t2, a7, a6
+; RV64IM-NEXT:    mul t3, t0, a6
+; RV64IM-NEXT:    mul t4, t1, a6
+; RV64IM-NEXT:    mul a6, a4, a6
+; RV64IM-NEXT:    add a4, a5, a4
 ; RV64IM-NEXT:    add a1, a1, t1
-; RV64IM-NEXT:    subw a1, a1, t2
-; RV64IM-NEXT:    add a2, a2, a6
-; RV64IM-NEXT:    subw a2, a2, t0
-; RV64IM-NEXT:    sh a3, 0(a0)
-; RV64IM-NEXT:    sh a4, 2(a0)
-; RV64IM-NEXT:    sh a1, 4(a0)
-; RV64IM-NEXT:    sh a2, 6(a0)
+; RV64IM-NEXT:    add a2, a2, t0
+; RV64IM-NEXT:    add a3, a3, a7
+; RV64IM-NEXT:    subw a4, a4, a6
+; RV64IM-NEXT:    subw a1, a1, t4
+; RV64IM-NEXT:    subw a2, a2, t3
+; RV64IM-NEXT:    subw a3, a3, t2
+; RV64IM-NEXT:    sh a4, 0(a0)
+; RV64IM-NEXT:    sh a1, 2(a0)
+; RV64IM-NEXT:    sh a2, 4(a0)
+; RV64IM-NEXT:    sh a3, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   %2 = sdiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -666,21 +666,21 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    srli a6, a5, 31
 ; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    add a5, a5, a6
+; RV32IM-NEXT:    srli a6, a1, 26
+; RV32IM-NEXT:    add a6, a1, a6
+; RV32IM-NEXT:    andi a6, a6, -64
+; RV32IM-NEXT:    sub a1, a1, a6
+; RV32IM-NEXT:    srli a6, a2, 27
+; RV32IM-NEXT:    add a6, a2, a6
+; RV32IM-NEXT:    andi a6, a6, -32
+; RV32IM-NEXT:    sub a2, a2, a6
+; RV32IM-NEXT:    srli a6, a3, 29
+; RV32IM-NEXT:    add a6, a3, a6
+; RV32IM-NEXT:    andi a6, a6, -8
+; RV32IM-NEXT:    sub a3, a3, a6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
 ; RV32IM-NEXT:    sub a4, a4, a5
-; RV32IM-NEXT:    srli a5, a1, 26
-; RV32IM-NEXT:    add a5, a1, a5
-; RV32IM-NEXT:    andi a5, a5, -64
-; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    srli a5, a2, 27
-; RV32IM-NEXT:    add a5, a2, a5
-; RV32IM-NEXT:    andi a5, a5, -32
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    srli a5, a3, 29
-; RV32IM-NEXT:    add a5, a3, a5
-; RV32IM-NEXT:    andi a5, a5, -8
-; RV32IM-NEXT:    sub a3, a3, a5
 ; RV32IM-NEXT:    sh a1, 0(a0)
 ; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    sh a3, 4(a0)
@@ -728,36 +728,36 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_srem_power_of_two:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lh a2, 24(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI3_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
+; RV64IM-NEXT:    lh a2, 0(a1)
+; RV64IM-NEXT:    lh a3, 8(a1)
 ; RV64IM-NEXT:    lh a4, 16(a1)
-; RV64IM-NEXT:    lh a5, 8(a1)
-; RV64IM-NEXT:    lh a1, 0(a1)
-; RV64IM-NEXT:    mulh a3, a2, a3
-; RV64IM-NEXT:    add a3, a3, a2
-; RV64IM-NEXT:    srli a6, a3, 63
-; RV64IM-NEXT:    srli a3, a3, 6
-; RV64IM-NEXT:    add a3, a3, a6
+; RV64IM-NEXT:    lh a1, 24(a1)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI3_0)
+; RV64IM-NEXT:    ld a5, %lo(.LCPI3_0)(a5)
+; RV64IM-NEXT:    srli a6, a2, 58
+; RV64IM-NEXT:    add a6, a2, a6
+; RV64IM-NEXT:    andi a6, a6, -64
+; RV64IM-NEXT:    subw a2, a2, a6
+; RV64IM-NEXT:    srli a6, a3, 59
+; RV64IM-NEXT:    add a6, a3, a6
+; RV64IM-NEXT:    andi a6, a6, -32
+; RV64IM-NEXT:    subw a3, a3, a6
+; RV64IM-NEXT:    srli a6, a4, 61
+; RV64IM-NEXT:    mulh a5, a1, a5
+; RV64IM-NEXT:    add a6, a4, a6
+; RV64IM-NEXT:    add a5, a5, a1
+; RV64IM-NEXT:    andi a6, a6, -8
+; RV64IM-NEXT:    subw a4, a4, a6
+; RV64IM-NEXT:    srli a6, a5, 63
+; RV64IM-NEXT:    srli a5, a5, 6
+; RV64IM-NEXT:    add a5, a5, a6
 ; RV64IM-NEXT:    li a6, 95
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    srli a3, a1, 58
-; RV64IM-NEXT:    add a3, a1, a3
-; RV64IM-NEXT:    andi a3, a3, -64
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    srli a3, a5, 59
-; RV64IM-NEXT:    add a3, a5, a3
-; RV64IM-NEXT:    andi a3, a3, -32
-; RV64IM-NEXT:    subw a5, a5, a3
-; RV64IM-NEXT:    srli a3, a4, 61
-; RV64IM-NEXT:    add a3, a4, a3
-; RV64IM-NEXT:    andi a3, a3, -8
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    sh a1, 0(a0)
-; RV64IM-NEXT:    sh a5, 2(a0)
+; RV64IM-NEXT:    mul a5, a5, a6
+; RV64IM-NEXT:    subw a1, a1, a5
+; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a3, 2(a0)
 ; RV64IM-NEXT:    sh a4, 4(a0)
-; RV64IM-NEXT:    sh a2, 6(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = srem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
   ret <4 x i16> %1
@@ -807,35 +807,35 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lh a3, 8(a1)
 ; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a4, 820904
+; RV32IM-NEXT:    lui a5, 729444
+; RV32IM-NEXT:    lui a6, 395996
 ; RV32IM-NEXT:    addi a4, a4, -1903
+; RV32IM-NEXT:    addi a5, a5, 713
+; RV32IM-NEXT:    addi a6, a6, -2009
 ; RV32IM-NEXT:    mulh a4, a2, a4
+; RV32IM-NEXT:    mulh a5, a3, a5
+; RV32IM-NEXT:    mulh a6, a1, a6
 ; RV32IM-NEXT:    add a4, a4, a2
-; RV32IM-NEXT:    srli a5, a4, 31
+; RV32IM-NEXT:    add a5, a5, a3
+; RV32IM-NEXT:    srli a7, a6, 31
+; RV32IM-NEXT:    srli a6, a6, 11
+; RV32IM-NEXT:    add a6, a6, a7
+; RV32IM-NEXT:    srli a7, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 9
-; RV32IM-NEXT:    add a4, a4, a5
-; RV32IM-NEXT:    li a5, 654
-; RV32IM-NEXT:    mul a4, a4, a5
+; RV32IM-NEXT:    add a4, a4, a7
+; RV32IM-NEXT:    srli a7, a5, 31
+; RV32IM-NEXT:    srli a5, a5, 4
+; RV32IM-NEXT:    add a5, a5, a7
+; RV32IM-NEXT:    lui a7, 1
+; RV32IM-NEXT:    addi a7, a7, 1327
+; RV32IM-NEXT:    mul a6, a6, a7
+; RV32IM-NEXT:    li a7, 654
+; RV32IM-NEXT:    mul a4, a4, a7
+; RV32IM-NEXT:    li a7, 23
+; RV32IM-NEXT:    mul a5, a5, a7
+; RV32IM-NEXT:    sub a1, a1, a6
 ; RV32IM-NEXT:    sub a2, a2, a4
-; RV32IM-NEXT:    lui a4, 729444
-; RV32IM-NEXT:    addi a4, a4, 713
-; RV32IM-NEXT:    mulh a4, a3, a4
-; RV32IM-NEXT:    add a4, a4, a3
-; RV32IM-NEXT:    srli a5, a4, 31
-; RV32IM-NEXT:    srli a4, a4, 4
-; RV32IM-NEXT:    add a4, a4, a5
-; RV32IM-NEXT:    li a5, 23
-; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
-; RV32IM-NEXT:    lui a4, 395996
-; RV32IM-NEXT:    addi a4, a4, -2009
-; RV32IM-NEXT:    mulh a4, a1, a4
-; RV32IM-NEXT:    srli a5, a4, 31
-; RV32IM-NEXT:    srli a4, a4, 11
-; RV32IM-NEXT:    add a4, a4, a5
-; RV32IM-NEXT:    lui a5, 1
-; RV32IM-NEXT:    addi a5, a5, 1327
-; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a1, a1, a4
+; RV32IM-NEXT:    sub a3, a3, a5
 ; RV32IM-NEXT:    sh zero, 0(a0)
 ; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    sh a3, 4(a0)
@@ -880,40 +880,40 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_srem_one:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, %hi(.LCPI4_0)
-; RV64IM-NEXT:    ld a2, %lo(.LCPI4_0)(a2)
+; RV64IM-NEXT:    lh a2, 8(a1)
 ; RV64IM-NEXT:    lh a3, 16(a1)
-; RV64IM-NEXT:    lh a4, 8(a1)
 ; RV64IM-NEXT:    lh a1, 24(a1)
-; RV64IM-NEXT:    mulh a2, a3, a2
-; RV64IM-NEXT:    add a2, a2, a3
-; RV64IM-NEXT:    srli a5, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 4
-; RV64IM-NEXT:    add a2, a2, a5
+; RV64IM-NEXT:    lui a4, %hi(.LCPI4_0)
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_1)
+; RV64IM-NEXT:    lui a6, %hi(.LCPI4_2)
+; RV64IM-NEXT:    ld a4, %lo(.LCPI4_0)(a4)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_1)(a5)
-; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    subw a3, a3, a2
-; RV64IM-NEXT:    mulh a2, a4, a5
-; RV64IM-NEXT:    srli a5, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 8
-; RV64IM-NEXT:    add a2, a2, a5
-; RV64IM-NEXT:    lui a5, %hi(.LCPI4_2)
-; RV64IM-NEXT:    ld a5, %lo(.LCPI4_2)(a5)
-; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    subw a4, a4, a2
-; RV64IM-NEXT:    mulh a2, a1, a5
-; RV64IM-NEXT:    srli a5, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 11
-; RV64IM-NEXT:    add a2, a2, a5
-; RV64IM-NEXT:    lui a5, 1
-; RV64IM-NEXT:    addi a5, a5, 1327
-; RV64IM-NEXT:    mul a2, a2, a5
-; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    ld a6, %lo(.LCPI4_2)(a6)
+; RV64IM-NEXT:    mulh a4, a3, a4
+; RV64IM-NEXT:    mulh a5, a2, a5
+; RV64IM-NEXT:    mulh a6, a1, a6
+; RV64IM-NEXT:    add a4, a4, a3
+; RV64IM-NEXT:    srli a7, a5, 63
+; RV64IM-NEXT:    srli a5, a5, 8
+; RV64IM-NEXT:    add a5, a5, a7
+; RV64IM-NEXT:    srli a7, a6, 63
+; RV64IM-NEXT:    srli a6, a6, 11
+; RV64IM-NEXT:    add a6, a6, a7
+; RV64IM-NEXT:    srli a7, a4, 63
+; RV64IM-NEXT:    srli a4, a4, 4
+; RV64IM-NEXT:    add a4, a4, a7
+; RV64IM-NEXT:    li a7, 654
+; RV64IM-NEXT:    mul a5, a5, a7
+; RV64IM-NEXT:    lui a7, 1
+; RV64IM-NEXT:    addi a7, a7, 1327
+; RV64IM-NEXT:    mul a6, a6, a7
+; RV64IM-NEXT:    li a7, 23
+; RV64IM-NEXT:    mul a4, a4, a7
+; RV64IM-NEXT:    subw a2, a2, a5
+; RV64IM-NEXT:    subw a1, a1, a6
+; RV64IM-NEXT:    subw a3, a3, a4
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a2, 2(a0)
 ; RV64IM-NEXT:    sh a3, 4(a0)
 ; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
@@ -931,8 +931,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lh a2, 4(a1)
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lh a2, 4(a1)
 ; RV32I-NEXT:    lh a0, 8(a1)
 ; RV32I-NEXT:    lh s1, 12(a1)
 ; RV32I-NEXT:    srli a1, a2, 17
@@ -965,30 +965,30 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lh a3, 8(a1)
 ; RV32IM-NEXT:    lh a1, 12(a1)
 ; RV32IM-NEXT:    lui a4, 729444
+; RV32IM-NEXT:    lui a5, 395996
+; RV32IM-NEXT:    lui a6, 8
 ; RV32IM-NEXT:    addi a4, a4, 713
+; RV32IM-NEXT:    addi a5, a5, -2009
 ; RV32IM-NEXT:    mulh a4, a3, a4
+; RV32IM-NEXT:    mulh a5, a1, a5
+; RV32IM-NEXT:    srli a7, a5, 31
+; RV32IM-NEXT:    srli a5, a5, 11
+; RV32IM-NEXT:    add a5, a5, a7
+; RV32IM-NEXT:    srli a7, a2, 17
 ; RV32IM-NEXT:    add a4, a4, a3
-; RV32IM-NEXT:    srli a5, a4, 31
+; RV32IM-NEXT:    add a7, a2, a7
+; RV32IM-NEXT:    and a6, a7, a6
+; RV32IM-NEXT:    srli a7, a4, 31
 ; RV32IM-NEXT:    srli a4, a4, 4
-; RV32IM-NEXT:    add a4, a4, a5
-; RV32IM-NEXT:    li a5, 23
-; RV32IM-NEXT:    mul a4, a4, a5
+; RV32IM-NEXT:    add a4, a4, a7
+; RV32IM-NEXT:    lui a7, 1
+; RV32IM-NEXT:    addi a7, a7, 1327
+; RV32IM-NEXT:    mul a5, a5, a7
+; RV32IM-NEXT:    sub a2, a2, a6
+; RV32IM-NEXT:    li a6, 23
+; RV32IM-NEXT:    mul a4, a4, a6
+; RV32IM-NEXT:    sub a1, a1, a5
 ; RV32IM-NEXT:    sub a3, a3, a4
-; RV32IM-NEXT:    lui a4, 395996
-; RV32IM-NEXT:    addi a4, a4, -2009
-; RV32IM-NEXT:    mulh a4, a1, a4
-; RV32IM-NEXT:    srli a5, a4, 31
-; RV32IM-NEXT:    srli a4, a4, 11
-; RV32IM-NEXT:    add a4, a4, a5
-; RV32IM-NEXT:    lui a5, 1
-; RV32IM-NEXT:    addi a5, a5, 1327
-; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a1, a1, a4
-; RV32IM-NEXT:    srli a4, a2, 17
-; RV32IM-NEXT:    add a4, a2, a4
-; RV32IM-NEXT:    lui a5, 8
-; RV32IM-NEXT:    and a4, a4, a5
-; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
 ; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    sh a3, 4(a0)
@@ -1003,8 +1003,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ; RV64I-NEXT:    sd s1, 24(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    sd s3, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    lh a2, 8(a1)
 ; RV64I-NEXT:    mv s0, a0
+; RV64I-NEXT:    lh a2, 8(a1)
 ; RV64I-NEXT:    lh a0, 16(a1)
 ; RV64I-NEXT:    lh s1, 24(a1)
 ; RV64I-NEXT:    srli a1, a2, 49
@@ -1033,36 +1033,36 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_i16_smax:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, %hi(.LCPI5_0)
-; RV64IM-NEXT:    ld a2, %lo(.LCPI5_0)(a2)
+; RV64IM-NEXT:    lh a2, 8(a1)
 ; RV64IM-NEXT:    lh a3, 16(a1)
-; RV64IM-NEXT:    lh a4, 8(a1)
 ; RV64IM-NEXT:    lh a1, 24(a1)
-; RV64IM-NEXT:    mulh a2, a3, a2
-; RV64IM-NEXT:    add a2, a2, a3
-; RV64IM-NEXT:    srli a5, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 4
-; RV64IM-NEXT:    add a2, a2, a5
+; RV64IM-NEXT:    lui a4, %hi(.LCPI5_0)
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI5_1)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI5_1)(a5)
+; RV64IM-NEXT:    lui a6, 8
+; RV64IM-NEXT:    ld a4, %lo(.LCPI5_0)(a4)
+; RV64IM-NEXT:    srli a7, a2, 49
+; RV64IM-NEXT:    mulh a5, a1, a5
+; RV64IM-NEXT:    add a7, a2, a7
+; RV64IM-NEXT:    and a6, a7, a6
+; RV64IM-NEXT:    srli a7, a5, 63
+; RV64IM-NEXT:    srli a5, a5, 11
+; RV64IM-NEXT:    add a5, a5, a7
+; RV64IM-NEXT:    mulh a4, a3, a4
+; RV64IM-NEXT:    add a4, a4, a3
+; RV64IM-NEXT:    subw a2, a2, a6
+; RV64IM-NEXT:    srli a6, a4, 63
+; RV64IM-NEXT:    srli a4, a4, 4
+; RV64IM-NEXT:    add a4, a4, a6
+; RV64IM-NEXT:    lui a6, 1
+; RV64IM-NEXT:    addi a6, a6, 1327
+; RV64IM-NEXT:    mul a5, a5, a6
 ; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    subw a3, a3, a2
-; RV64IM-NEXT:    mulh a2, a1, a5
-; RV64IM-NEXT:    srli a5, a2, 63
-; RV64IM-NEXT:    srli a2, a2, 11
-; RV64IM-NEXT:    add a2, a2, a5
-; RV64IM-NEXT:    lui a5, 1
-; RV64IM-NEXT:    addi a5, a5, 1327
-; RV64IM-NEXT:    mul a2, a2, a5
-; RV64IM-NEXT:    subw a1, a1, a2
-; RV64IM-NEXT:    srli a2, a4, 49
-; RV64IM-NEXT:    add a2, a4, a2
-; RV64IM-NEXT:    lui a5, 8
-; RV64IM-NEXT:    and a2, a2, a5
-; RV64IM-NEXT:    subw a4, a4, a2
+; RV64IM-NEXT:    mul a4, a4, a6
+; RV64IM-NEXT:    subw a1, a1, a5
+; RV64IM-NEXT:    subw a3, a3, a4
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a4, 2(a0)
+; RV64IM-NEXT:    sh a2, 2(a0)
 ; RV64IM-NEXT:    sh a3, 4(a0)
 ; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
@@ -1250,40 +1250,40 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_srem_i64:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, %hi(.LCPI6_0)
-; RV64IM-NEXT:    ld a2, %lo(.LCPI6_0)(a2)
+; RV64IM-NEXT:    ld a2, 8(a1)
 ; RV64IM-NEXT:    ld a3, 16(a1)
-; RV64IM-NEXT:    ld a4, 8(a1)
 ; RV64IM-NEXT:    ld a1, 24(a1)
-; RV64IM-NEXT:    mulh a2, a3, a2
-; RV64IM-NEXT:    add a2, a2, a3
-; RV64IM-NEXT:    srli a5, a2, 63
-; RV64IM-NEXT:    srai a2, a2, 4
-; RV64IM-NEXT:    add a2, a2, a5
+; RV64IM-NEXT:    lui a4, %hi(.LCPI6_0)
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI6_1)
+; RV64IM-NEXT:    lui a6, %hi(.LCPI6_2)
+; RV64IM-NEXT:    ld a4, %lo(.LCPI6_0)(a4)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI6_1)(a5)
-; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    sub a3, a3, a2
-; RV64IM-NEXT:    mulh a2, a4, a5
-; RV64IM-NEXT:    srli a5, a2, 63
-; RV64IM-NEXT:    srai a2, a2, 8
-; RV64IM-NEXT:    add a2, a2, a5
-; RV64IM-NEXT:    lui a5, %hi(.LCPI6_2)
-; RV64IM-NEXT:    ld a5, %lo(.LCPI6_2)(a5)
-; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    sub a4, a4, a2
-; RV64IM-NEXT:    mulh a2, a1, a5
-; RV64IM-NEXT:    srli a5, a2, 63
-; RV64IM-NEXT:    srai a2, a2, 11
-; RV64IM-NEXT:    add a2, a2, a5
-; RV64IM-NEXT:    lui a5, 1
-; RV64IM-NEXT:    addiw a5, a5, 1327
-; RV64IM-NEXT:    mul a2, a2, a5
-; RV64IM-NEXT:    sub a1, a1, a2
+; RV64IM-NEXT:    ld a6, %lo(.LCPI6_2)(a6)
+; RV64IM-NEXT:    mulh a4, a3, a4
+; RV64IM-NEXT:    mulh a5, a2, a5
+; RV64IM-NEXT:    mulh a6, a1, a6
+; RV64IM-NEXT:    add a4, a4, a3
+; RV64IM-NEXT:    srli a7, a5, 63
+; RV64IM-NEXT:    srai a5, a5, 8
+; RV64IM-NEXT:    add a5, a5, a7
+; RV64IM-NEXT:    srli a7, a6, 63
+; RV64IM-NEXT:    srai a6, a6, 11
+; RV64IM-NEXT:    add a6, a6, a7
+; RV64IM-NEXT:    srli a7, a4, 63
+; RV64IM-NEXT:    srai a4, a4, 4
+; RV64IM-NEXT:    add a4, a4, a7
+; RV64IM-NEXT:    li a7, 654
+; RV64IM-NEXT:    mul a5, a5, a7
+; RV64IM-NEXT:    lui a7, 1
+; RV64IM-NEXT:    addiw a7, a7, 1327
+; RV64IM-NEXT:    mul a6, a6, a7
+; RV64IM-NEXT:    li a7, 23
+; RV64IM-NEXT:    mul a4, a4, a7
+; RV64IM-NEXT:    sub a2, a2, a5
+; RV64IM-NEXT:    sub a1, a1, a6
+; RV64IM-NEXT:    sub a3, a3, a4
 ; RV64IM-NEXT:    sd zero, 0(a0)
-; RV64IM-NEXT:    sd a4, 8(a0)
+; RV64IM-NEXT:    sd a2, 8(a0)
 ; RV64IM-NEXT:    sd a3, 16(a0)
 ; RV64IM-NEXT:    sd a1, 24(a0)
 ; RV64IM-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
index d113afa769931a..0499992b71778c 100644
--- a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll
@@ -44,11 +44,11 @@ define i32 @func32(i32 %x, i32 %y, i32 %z) nounwind {
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    sext.w a0, a0
 ; RV64IZbb-NEXT:    mulw a1, a1, a2
+; RV64IZbb-NEXT:    lui a2, 524288
 ; RV64IZbb-NEXT:    sub a0, a0, a1
-; RV64IZbb-NEXT:    lui a1, 524288
-; RV64IZbb-NEXT:    addiw a2, a1, -1
-; RV64IZbb-NEXT:    min a0, a0, a2
-; RV64IZbb-NEXT:    max a0, a0, a1
+; RV64IZbb-NEXT:    addiw a1, a2, -1
+; RV64IZbb-NEXT:    min a0, a0, a1
+; RV64IZbb-NEXT:    max a0, a0, a2
 ; RV64IZbb-NEXT:    ret
   %a = mul i32 %y, %z
   %tmp = call i32 @llvm.ssub.sat.i32(i32 %x, i32 %a)
@@ -98,13 +98,13 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV32I-LABEL: func16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 16
-; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    lui a2, 8
+; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    slli a1, a1, 16
 ; RV32I-NEXT:    srai a1, a1, 16
 ; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lui a1, 8
-; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    addi a1, a2, -1
 ; RV32I-NEXT:    bge a0, a1, .LBB2_3
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    lui a1, 1048568
@@ -122,13 +122,13 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV64I-LABEL: func16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 48
-; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    lui a2, 8
+; RV64I-NEXT:    srai a0, a0, 48
 ; RV64I-NEXT:    slli a1, a1, 48
 ; RV64I-NEXT:    srai a1, a1, 48
 ; RV64I-NEXT:    sub a0, a0, a1
-; RV64I-NEXT:    lui a1, 8
-; RV64I-NEXT:    addiw a1, a1, -1
+; RV64I-NEXT:    addiw a1, a2, -1
 ; RV64I-NEXT:    bge a0, a1, .LBB2_3
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    lui a1, 1048568
@@ -147,11 +147,11 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    sext.h a0, a0
 ; RV32IZbb-NEXT:    mul a1, a1, a2
+; RV32IZbb-NEXT:    lui a2, 8
 ; RV32IZbb-NEXT:    sext.h a1, a1
+; RV32IZbb-NEXT:    addi a2, a2, -1
 ; RV32IZbb-NEXT:    sub a0, a0, a1
-; RV32IZbb-NEXT:    lui a1, 8
-; RV32IZbb-NEXT:    addi a1, a1, -1
-; RV32IZbb-NEXT:    min a0, a0, a1
+; RV32IZbb-NEXT:    min a0, a0, a2
 ; RV32IZbb-NEXT:    lui a1, 1048568
 ; RV32IZbb-NEXT:    max a0, a0, a1
 ; RV32IZbb-NEXT:    ret
@@ -160,11 +160,11 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    sext.h a0, a0
 ; RV64IZbb-NEXT:    mul a1, a1, a2
+; RV64IZbb-NEXT:    lui a2, 8
 ; RV64IZbb-NEXT:    sext.h a1, a1
+; RV64IZbb-NEXT:    addiw a2, a2, -1
 ; RV64IZbb-NEXT:    sub a0, a0, a1
-; RV64IZbb-NEXT:    lui a1, 8
-; RV64IZbb-NEXT:    addiw a1, a1, -1
-; RV64IZbb-NEXT:    min a0, a0, a1
+; RV64IZbb-NEXT:    min a0, a0, a2
 ; RV64IZbb-NEXT:    lui a1, 1048568
 ; RV64IZbb-NEXT:    max a0, a0, a1
 ; RV64IZbb-NEXT:    ret
@@ -177,8 +177,8 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV32I-LABEL: func8:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 24
 ; RV32I-NEXT:    srai a1, a1, 24
 ; RV32I-NEXT:    sub a0, a0, a1
@@ -200,8 +200,8 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV64I-LABEL: func8:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 56
-; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srai a0, a0, 56
 ; RV64I-NEXT:    slli a1, a1, 56
 ; RV64I-NEXT:    srai a1, a1, 56
 ; RV64I-NEXT:    sub a0, a0, a1
@@ -224,10 +224,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    sext.b a0, a0
 ; RV32IZbb-NEXT:    mul a1, a1, a2
+; RV32IZbb-NEXT:    li a2, 127
 ; RV32IZbb-NEXT:    sext.b a1, a1
 ; RV32IZbb-NEXT:    sub a0, a0, a1
-; RV32IZbb-NEXT:    li a1, 127
-; RV32IZbb-NEXT:    min a0, a0, a1
+; RV32IZbb-NEXT:    min a0, a0, a2
 ; RV32IZbb-NEXT:    li a1, -128
 ; RV32IZbb-NEXT:    max a0, a0, a1
 ; RV32IZbb-NEXT:    ret
@@ -236,10 +236,10 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind {
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    sext.b a0, a0
 ; RV64IZbb-NEXT:    mul a1, a1, a2
+; RV64IZbb-NEXT:    li a2, 127
 ; RV64IZbb-NEXT:    sext.b a1, a1
 ; RV64IZbb-NEXT:    sub a0, a0, a1
-; RV64IZbb-NEXT:    li a1, 127
-; RV64IZbb-NEXT:    min a0, a0, a1
+; RV64IZbb-NEXT:    min a0, a0, a2
 ; RV64IZbb-NEXT:    li a1, -128
 ; RV64IZbb-NEXT:    max a0, a0, a1
 ; RV64IZbb-NEXT:    ret
@@ -252,8 +252,8 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV32I-LABEL: func4:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    slli a0, a0, 28
-; RV32I-NEXT:    srai a0, a0, 28
 ; RV32I-NEXT:    mul a1, a1, a2
+; RV32I-NEXT:    srai a0, a0, 28
 ; RV32I-NEXT:    slli a1, a1, 28
 ; RV32I-NEXT:    srai a1, a1, 28
 ; RV32I-NEXT:    sub a0, a0, a1
@@ -275,8 +275,8 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV64I-LABEL: func4:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    slli a0, a0, 60
-; RV64I-NEXT:    srai a0, a0, 60
 ; RV64I-NEXT:    mul a1, a1, a2
+; RV64I-NEXT:    srai a0, a0, 60
 ; RV64I-NEXT:    slli a1, a1, 60
 ; RV64I-NEXT:    srai a1, a1, 60
 ; RV64I-NEXT:    sub a0, a0, a1
@@ -298,13 +298,13 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV32IZbb-LABEL: func4:
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    slli a0, a0, 28
-; RV32IZbb-NEXT:    srai a0, a0, 28
 ; RV32IZbb-NEXT:    mul a1, a1, a2
+; RV32IZbb-NEXT:    li a2, 7
+; RV32IZbb-NEXT:    srai a0, a0, 28
 ; RV32IZbb-NEXT:    slli a1, a1, 28
 ; RV32IZbb-NEXT:    srai a1, a1, 28
 ; RV32IZbb-NEXT:    sub a0, a0, a1
-; RV32IZbb-NEXT:    li a1, 7
-; RV32IZbb-NEXT:    min a0, a0, a1
+; RV32IZbb-NEXT:    min a0, a0, a2
 ; RV32IZbb-NEXT:    li a1, -8
 ; RV32IZbb-NEXT:    max a0, a0, a1
 ; RV32IZbb-NEXT:    ret
@@ -312,13 +312,13 @@ define i4 @func4(i4 %x, i4 %y, i4 %z) nounwind {
 ; RV64IZbb-LABEL: func4:
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    slli a0, a0, 60
-; RV64IZbb-NEXT:    srai a0, a0, 60
 ; RV64IZbb-NEXT:    mul a1, a1, a2
+; RV64IZbb-NEXT:    li a2, 7
+; RV64IZbb-NEXT:    srai a0, a0, 60
 ; RV64IZbb-NEXT:    slli a1, a1, 60
 ; RV64IZbb-NEXT:    srai a1, a1, 60
 ; RV64IZbb-NEXT:    sub a0, a0, a1
-; RV64IZbb-NEXT:    li a1, 7
-; RV64IZbb-NEXT:    min a0, a0, a1
+; RV64IZbb-NEXT:    min a0, a0, a2
 ; RV64IZbb-NEXT:    li a1, -8
 ; RV64IZbb-NEXT:    max a0, a0, a1
 ; RV64IZbb-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/stack-store-check.ll b/llvm/test/CodeGen/RISCV/stack-store-check.ll
index 052ccbf6e06f36..cd1aebfea5ce4e 100644
--- a/llvm/test/CodeGen/RISCV/stack-store-check.ll
+++ b/llvm/test/CodeGen/RISCV/stack-store-check.ll
@@ -266,13 +266,13 @@ define void @main() local_unnamed_addr nounwind {
 ; CHECK-NEXT:    lw a4, 156(sp)
 ; CHECK-NEXT:    lw a5, 160(sp)
 ; CHECK-NEXT:    lw a6, 164(sp)
-; CHECK-NEXT:    lui a0, 786400
+; CHECK-NEXT:    lui a2, 786400
+; CHECK-NEXT:    addi a0, sp, 104
+; CHECK-NEXT:    addi a1, sp, 88
 ; CHECK-NEXT:    sw zero, 72(sp)
 ; CHECK-NEXT:    sw zero, 76(sp)
 ; CHECK-NEXT:    sw zero, 80(sp)
-; CHECK-NEXT:    sw a0, 84(sp)
-; CHECK-NEXT:    addi a0, sp, 104
-; CHECK-NEXT:    addi a1, sp, 88
+; CHECK-NEXT:    sw a2, 84(sp)
 ; CHECK-NEXT:    addi a2, sp, 72
 ; CHECK-NEXT:    sw a3, 88(sp)
 ; CHECK-NEXT:    sw a4, 92(sp)

diff  --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
index f7ef01b0958d87..366b37ac5d4720 100644
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -43,9 +43,9 @@ define void @caller_extern(ptr %src) optsize {
 ; CHECK-LARGE-ZICFILP-NEXT:    lpad 0
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi1:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a1, %pcrel_hi(.LCPI1_0)
-; CHECK-LARGE-ZICFILP-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi1)(a1)
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi2:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a2, %pcrel_hi(.LCPI1_1)
+; CHECK-LARGE-ZICFILP-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi1)(a1)
 ; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi2)(a2)
 ; CHECK-LARGE-ZICFILP-NEXT:    li a2, 7
 ; CHECK-LARGE-ZICFILP-NEXT:    mv a3, a0
@@ -75,9 +75,9 @@ define void @caller_extern_pgso(ptr %src) !prof !14 {
 ; CHECK-LARGE-ZICFILP-NEXT:    lpad 0
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi3:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a1, %pcrel_hi(.LCPI2_0)
-; CHECK-LARGE-ZICFILP-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi3)(a1)
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi4:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a2, %pcrel_hi(.LCPI2_1)
+; CHECK-LARGE-ZICFILP-NEXT:    lw a1, %pcrel_lo(.Lpcrel_hi3)(a1)
 ; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi4)(a2)
 ; CHECK-LARGE-ZICFILP-NEXT:    li a2, 7
 ; CHECK-LARGE-ZICFILP-NEXT:    mv a3, a0
@@ -465,9 +465,9 @@ define void @caller_nostruct() nounwind {
 ; CHECK-LARGE-ZICFILP-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi13:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a0, %pcrel_hi(.LCPI11_0)
-; CHECK-LARGE-ZICFILP-NEXT:    lw a0, %pcrel_lo(.Lpcrel_hi13)(a0)
 ; CHECK-LARGE-ZICFILP-NEXT:  .Lpcrel_hi14:
 ; CHECK-LARGE-ZICFILP-NEXT:    auipc a1, %pcrel_hi(.LCPI11_1)
+; CHECK-LARGE-ZICFILP-NEXT:    lw a0, %pcrel_lo(.Lpcrel_hi13)(a0)
 ; CHECK-LARGE-ZICFILP-NEXT:    lw t2, %pcrel_lo(.Lpcrel_hi14)(a1)
 ; CHECK-LARGE-ZICFILP-NEXT:    jalr t2
 ; CHECK-LARGE-ZICFILP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload

diff  --git a/llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll b/llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll
index f270775adcc155..9f81ff8c8d31a1 100644
--- a/llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll
+++ b/llvm/test/CodeGen/RISCV/trunc-nsw-nuw.ll
@@ -16,12 +16,12 @@ define signext i32 @trunc_nuw_nsw_urem(i64 %x) nounwind {
 ; CHECK-LABEL: trunc_nuw_nsw_urem:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    lui a1, 210
+; CHECK-NEXT:    lui a2, 2
 ; CHECK-NEXT:    addiw a1, a1, -1167
 ; CHECK-NEXT:    slli a1, a1, 12
 ; CHECK-NEXT:    addi a1, a1, 1881
 ; CHECK-NEXT:    mul a1, a0, a1
 ; CHECK-NEXT:    srli a1, a1, 45
-; CHECK-NEXT:    lui a2, 2
 ; CHECK-NEXT:    addi a2, a2, 1808
 ; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    subw a0, a0, a1

diff  --git a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
index 23875a7ec56211..409114f8a96123 100644
--- a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll
@@ -105,28 +105,28 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV32I-LABEL: func16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a3, 16
-; RV32I-NEXT:    addi a3, a3, -1
-; RV32I-NEXT:    and a0, a0, a3
-; RV32I-NEXT:    mul a1, a1, a2
-; RV32I-NEXT:    and a1, a1, a3
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    bltu a0, a3, .LBB2_2
+; RV32I-NEXT:    mul a2, a1, a2
+; RV32I-NEXT:    addi a1, a3, -1
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    and a2, a2, a1
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    bltu a0, a1, .LBB2_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    mv a0, a3
+; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:  .LBB2_2:
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: func16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a3, 16
-; RV64I-NEXT:    addiw a3, a3, -1
-; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    mul a1, a1, a2
-; RV64I-NEXT:    and a1, a1, a3
-; RV64I-NEXT:    add a0, a0, a1
-; RV64I-NEXT:    bltu a0, a3, .LBB2_2
+; RV64I-NEXT:    mul a2, a1, a2
+; RV64I-NEXT:    addiw a1, a3, -1
+; RV64I-NEXT:    and a0, a0, a1
+; RV64I-NEXT:    and a2, a2, a1
+; RV64I-NEXT:    add a0, a0, a2
+; RV64I-NEXT:    bltu a0, a1, .LBB2_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    mv a0, a3
+; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:  .LBB2_2:
 ; RV64I-NEXT:    ret
 ;
@@ -134,22 +134,22 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV32IZbb:       # %bb.0:
 ; RV32IZbb-NEXT:    zext.h a0, a0
 ; RV32IZbb-NEXT:    mul a1, a1, a2
+; RV32IZbb-NEXT:    lui a2, 16
 ; RV32IZbb-NEXT:    zext.h a1, a1
 ; RV32IZbb-NEXT:    add a0, a0, a1
-; RV32IZbb-NEXT:    lui a1, 16
-; RV32IZbb-NEXT:    addi a1, a1, -1
-; RV32IZbb-NEXT:    minu a0, a0, a1
+; RV32IZbb-NEXT:    addi a2, a2, -1
+; RV32IZbb-NEXT:    minu a0, a0, a2
 ; RV32IZbb-NEXT:    ret
 ;
 ; RV64IZbb-LABEL: func16:
 ; RV64IZbb:       # %bb.0:
 ; RV64IZbb-NEXT:    zext.h a0, a0
 ; RV64IZbb-NEXT:    mul a1, a1, a2
+; RV64IZbb-NEXT:    lui a2, 16
 ; RV64IZbb-NEXT:    zext.h a1, a1
 ; RV64IZbb-NEXT:    add a0, a0, a1
-; RV64IZbb-NEXT:    lui a1, 16
-; RV64IZbb-NEXT:    addiw a1, a1, -1
-; RV64IZbb-NEXT:    minu a0, a0, a1
+; RV64IZbb-NEXT:    addiw a2, a2, -1
+; RV64IZbb-NEXT:    minu a0, a0, a2
 ; RV64IZbb-NEXT:    ret
   %a = mul i16 %y, %z
   %tmp = call i16 @llvm.uadd.sat.i16(i16 %x, i16 %a)

diff  --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
index dde69667b8ec30..d6fd4f15c4e53b 100644
--- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll
@@ -10,99 +10,101 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 {
 ; RISCV32-NEXT:    sw s2, 20(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s3, 16(sp) # 4-byte Folded Spill
 ; RISCV32-NEXT:    sw s4, 12(sp) # 4-byte Folded Spill
-; RISCV32-NEXT:    lw a3, 0(a1)
-; RISCV32-NEXT:    lw t2, 4(a1)
-; RISCV32-NEXT:    lw a4, 8(a1)
-; RISCV32-NEXT:    lw a5, 12(a1)
-; RISCV32-NEXT:    lw a1, 0(a2)
-; RISCV32-NEXT:    lw t0, 4(a2)
-; RISCV32-NEXT:    lw a6, 8(a2)
-; RISCV32-NEXT:    lw a7, 12(a2)
-; RISCV32-NEXT:    mulhu a2, a3, a1
-; RISCV32-NEXT:    mul t1, t2, a1
-; RISCV32-NEXT:    add a2, t1, a2
-; RISCV32-NEXT:    sltu t1, a2, t1
-; RISCV32-NEXT:    mulhu t3, t2, a1
-; RISCV32-NEXT:    add t4, t3, t1
-; RISCV32-NEXT:    mul t1, a3, t0
-; RISCV32-NEXT:    add a2, t1, a2
-; RISCV32-NEXT:    sltu t1, a2, t1
-; RISCV32-NEXT:    mulhu t3, a3, t0
-; RISCV32-NEXT:    add t1, t3, t1
-; RISCV32-NEXT:    add t5, t4, t1
-; RISCV32-NEXT:    mul t6, t2, t0
-; RISCV32-NEXT:    add s0, t6, t5
-; RISCV32-NEXT:    mul t1, a6, a3
-; RISCV32-NEXT:    mul s3, a4, a1
-; RISCV32-NEXT:    add s4, s3, t1
-; RISCV32-NEXT:    add t1, s0, s4
-; RISCV32-NEXT:    sltu t3, t1, s0
-; RISCV32-NEXT:    sltu s0, s0, t6
-; RISCV32-NEXT:    sltu t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, t2, t0
-; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    add s0, t4, s0
-; RISCV32-NEXT:    mul t4, t2, a6
-; RISCV32-NEXT:    mul t5, a7, a3
-; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu s1, a6, a3
-; RISCV32-NEXT:    add s2, s1, t4
-; RISCV32-NEXT:    mul t4, t0, a4
-; RISCV32-NEXT:    mul t5, a5, a1
-; RISCV32-NEXT:    add t4, t5, t4
-; RISCV32-NEXT:    mulhu t5, a4, a1
-; RISCV32-NEXT:    add t6, t5, t4
-; RISCV32-NEXT:    add t4, t6, s2
-; RISCV32-NEXT:    sltu s3, s4, s3
-; RISCV32-NEXT:    add t4, t4, s3
-; RISCV32-NEXT:    add t4, s0, t4
-; RISCV32-NEXT:    add t4, t4, t3
-; RISCV32-NEXT:    beq t4, s0, .LBB0_2
+; RISCV32-NEXT:    sw s5, 8(sp) # 4-byte Folded Spill
+; RISCV32-NEXT:    lw a4, 0(a1)
+; RISCV32-NEXT:    lw t0, 4(a1)
+; RISCV32-NEXT:    lw a3, 8(a1)
+; RISCV32-NEXT:    lw a1, 12(a1)
+; RISCV32-NEXT:    lw a6, 0(a2)
+; RISCV32-NEXT:    lw a5, 4(a2)
+; RISCV32-NEXT:    lw a7, 8(a2)
+; RISCV32-NEXT:    lw a2, 12(a2)
+; RISCV32-NEXT:    mulhu t1, a4, a6
+; RISCV32-NEXT:    mul t2, t0, a6
+; RISCV32-NEXT:    mulhu t3, t0, a6
+; RISCV32-NEXT:    mul t4, a4, a5
+; RISCV32-NEXT:    mulhu t5, a4, a5
+; RISCV32-NEXT:    mul s2, t0, a5
+; RISCV32-NEXT:    mul t6, a7, a4
+; RISCV32-NEXT:    mul s3, a3, a6
+; RISCV32-NEXT:    mul s0, t0, a7
+; RISCV32-NEXT:    mul s1, a2, a4
+; RISCV32-NEXT:    mul s4, a5, a3
+; RISCV32-NEXT:    add s1, s1, s0
+; RISCV32-NEXT:    mul s0, a1, a6
+; RISCV32-NEXT:    add s4, s0, s4
+; RISCV32-NEXT:    mulhu s5, t0, a5
+; RISCV32-NEXT:    add t1, t2, t1
+; RISCV32-NEXT:    sltu t2, t1, t2
+; RISCV32-NEXT:    add t2, t3, t2
+; RISCV32-NEXT:    mulhu s0, a7, a4
+; RISCV32-NEXT:    add t1, t4, t1
+; RISCV32-NEXT:    sltu t3, t1, t4
+; RISCV32-NEXT:    add t3, t5, t3
+; RISCV32-NEXT:    mulhu t5, a3, a6
+; RISCV32-NEXT:    add t4, s3, t6
+; RISCV32-NEXT:    add s1, s0, s1
+; RISCV32-NEXT:    add t6, t5, s4
+; RISCV32-NEXT:    sltu s3, t4, s3
+; RISCV32-NEXT:    add t3, t2, t3
+; RISCV32-NEXT:    sltu t2, t3, t2
+; RISCV32-NEXT:    add s5, s5, t2
+; RISCV32-NEXT:    add s4, t6, s1
+; RISCV32-NEXT:    add t3, s2, t3
+; RISCV32-NEXT:    add t2, t3, t4
+; RISCV32-NEXT:    sltu s2, t3, s2
+; RISCV32-NEXT:    sltu t4, t2, t3
+; RISCV32-NEXT:    add s2, s5, s2
+; RISCV32-NEXT:    add s3, s4, s3
+; RISCV32-NEXT:    add t3, s2, s3
+; RISCV32-NEXT:    add t3, t3, t4
+; RISCV32-NEXT:    beq t3, s2, .LBB0_2
 ; RISCV32-NEXT:  # %bb.1: # %start
-; RISCV32-NEXT:    sltu t3, t4, s0
+; RISCV32-NEXT:    sltu t4, t3, s2
 ; RISCV32-NEXT:  .LBB0_2: # %start
-; RISCV32-NEXT:    sltu s0, s2, s1
-; RISCV32-NEXT:    snez s1, t2
-; RISCV32-NEXT:    snez s2, a7
-; RISCV32-NEXT:    and s1, s2, s1
-; RISCV32-NEXT:    mulhu s2, a7, a3
-; RISCV32-NEXT:    snez s2, s2
-; RISCV32-NEXT:    or s1, s1, s2
-; RISCV32-NEXT:    mulhu t2, t2, a6
-; RISCV32-NEXT:    snez t2, t2
-; RISCV32-NEXT:    or t2, s1, t2
-; RISCV32-NEXT:    or t2, t2, s0
+; RISCV32-NEXT:    sltu s0, s1, s0
+; RISCV32-NEXT:    snez s1, t0
+; RISCV32-NEXT:    snez s2, a2
 ; RISCV32-NEXT:    sltu t5, t6, t5
-; RISCV32-NEXT:    snez t6, t0
-; RISCV32-NEXT:    snez s0, a5
-; RISCV32-NEXT:    and t6, s0, t6
-; RISCV32-NEXT:    mulhu s0, a5, a1
-; RISCV32-NEXT:    snez s0, s0
-; RISCV32-NEXT:    or t6, t6, s0
-; RISCV32-NEXT:    mulhu t0, t0, a4
+; RISCV32-NEXT:    mulhu t6, a2, a4
+; RISCV32-NEXT:    mulhu t0, t0, a7
+; RISCV32-NEXT:    or a2, a7, a2
+; RISCV32-NEXT:    snez a7, a5
+; RISCV32-NEXT:    mul a4, a4, a6
+; RISCV32-NEXT:    mulhu a6, a1, a6
+; RISCV32-NEXT:    mulhu a5, a5, a3
+; RISCV32-NEXT:    or a3, a3, a1
+; RISCV32-NEXT:    snez a1, a1
+; RISCV32-NEXT:    and s1, s2, s1
+; RISCV32-NEXT:    snez t6, t6
 ; RISCV32-NEXT:    snez t0, t0
-; RISCV32-NEXT:    or t0, t6, t0
-; RISCV32-NEXT:    or t0, t0, t5
-; RISCV32-NEXT:    or a6, a6, a7
+; RISCV32-NEXT:    and a1, a1, a7
 ; RISCV32-NEXT:    snez a6, a6
-; RISCV32-NEXT:    or a4, a4, a5
-; RISCV32-NEXT:    snez a4, a4
-; RISCV32-NEXT:    and a4, a4, a6
-; RISCV32-NEXT:    or a4, a4, t0
-; RISCV32-NEXT:    or a4, a4, t2
-; RISCV32-NEXT:    or a4, a4, t3
-; RISCV32-NEXT:    mul a1, a3, a1
-; RISCV32-NEXT:    andi a4, a4, 1
-; RISCV32-NEXT:    sw a1, 0(a0)
-; RISCV32-NEXT:    sw a2, 4(a0)
-; RISCV32-NEXT:    sw t1, 8(a0)
-; RISCV32-NEXT:    sw t4, 12(a0)
-; RISCV32-NEXT:    sb a4, 16(a0)
+; RISCV32-NEXT:    snez a5, a5
+; RISCV32-NEXT:    snez a2, a2
+; RISCV32-NEXT:    snez a3, a3
+; RISCV32-NEXT:    or a7, s1, t6
+; RISCV32-NEXT:    or a1, a1, a6
+; RISCV32-NEXT:    and a2, a3, a2
+; RISCV32-NEXT:    or a3, a7, t0
+; RISCV32-NEXT:    or a1, a1, a5
+; RISCV32-NEXT:    or a3, a3, s0
+; RISCV32-NEXT:    or a1, a1, t5
+; RISCV32-NEXT:    or a1, a2, a1
+; RISCV32-NEXT:    or a1, a1, a3
+; RISCV32-NEXT:    or a1, a1, t4
+; RISCV32-NEXT:    andi a1, a1, 1
+; RISCV32-NEXT:    sw a4, 0(a0)
+; RISCV32-NEXT:    sw t1, 4(a0)
+; RISCV32-NEXT:    sw t2, 8(a0)
+; RISCV32-NEXT:    sw t3, 12(a0)
+; RISCV32-NEXT:    sb a1, 16(a0)
 ; RISCV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s3, 16(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    lw s4, 12(sp) # 4-byte Folded Reload
+; RISCV32-NEXT:    lw s5, 8(sp) # 4-byte Folded Reload
 ; RISCV32-NEXT:    addi sp, sp, 32
 ; RISCV32-NEXT:    ret
 start:

diff  --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index a2f5e446b63bcd..1cdfaa5c4154be 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -133,50 +133,49 @@ define i64 @load_i64(ptr %p) {
 ; RV32I-LABEL: load_i64:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a1, 1(a0)
-; RV32I-NEXT:    lbu a2, 0(a0)
-; RV32I-NEXT:    lbu a3, 2(a0)
-; RV32I-NEXT:    lbu a4, 3(a0)
+; RV32I-NEXT:    lbu a2, 2(a0)
+; RV32I-NEXT:    lbu a3, 3(a0)
+; RV32I-NEXT:    lbu a4, 0(a0)
 ; RV32I-NEXT:    slli a1, a1, 8
-; RV32I-NEXT:    or a1, a1, a2
-; RV32I-NEXT:    slli a3, a3, 16
-; RV32I-NEXT:    slli a4, a4, 24
-; RV32I-NEXT:    or a2, a4, a3
-; RV32I-NEXT:    lbu a3, 4(a0)
-; RV32I-NEXT:    lbu a4, 5(a0)
-; RV32I-NEXT:    or a2, a2, a1
-; RV32I-NEXT:    lbu a1, 6(a0)
+; RV32I-NEXT:    slli a2, a2, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    lbu a4, 4(a0)
+; RV32I-NEXT:    lbu a5, 5(a0)
+; RV32I-NEXT:    or a2, a3, a2
+; RV32I-NEXT:    lbu a3, 6(a0)
 ; RV32I-NEXT:    lbu a0, 7(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    slli a1, a1, 16
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    slli a3, a3, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    or a1, a0, a3
-; RV32I-NEXT:    mv a0, a2
+; RV32I-NEXT:    or a3, a0, a3
+; RV32I-NEXT:    or a0, a2, a1
+; RV32I-NEXT:    or a1, a3, a4
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: load_i64:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lbu a1, 1(a0)
-; RV64I-NEXT:    lbu a2, 0(a0)
-; RV64I-NEXT:    lbu a3, 2(a0)
-; RV64I-NEXT:    lbu a4, 3(a0)
+; RV64I-NEXT:    lbu a2, 2(a0)
+; RV64I-NEXT:    lbu a3, 3(a0)
+; RV64I-NEXT:    lbu a4, 0(a0)
 ; RV64I-NEXT:    slli a1, a1, 8
-; RV64I-NEXT:    or a1, a1, a2
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a4, a4, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a2, 4(a0)
-; RV64I-NEXT:    lbu a4, 5(a0)
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    slli a2, a2, 16
+; RV64I-NEXT:    slli a3, a3, 24
+; RV64I-NEXT:    or a1, a1, a4
+; RV64I-NEXT:    lbu a4, 4(a0)
+; RV64I-NEXT:    lbu a5, 5(a0)
+; RV64I-NEXT:    or a2, a3, a2
 ; RV64I-NEXT:    lbu a3, 6(a0)
 ; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a2, a4, a2
+; RV64I-NEXT:    slli a5, a5, 8
+; RV64I-NEXT:    or a4, a5, a4
 ; RV64I-NEXT:    slli a3, a3, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    or a0, a0, a2
+; RV64I-NEXT:    or a1, a2, a1
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
@@ -187,16 +186,16 @@ define i64 @load_i64(ptr %p) {
 ; RV32IZBKB-NEXT:    lbu a2, 1(a0)
 ; RV32IZBKB-NEXT:    lbu a3, 2(a0)
 ; RV32IZBKB-NEXT:    lbu a4, 3(a0)
-; RV32IZBKB-NEXT:    lbu a5, 4(a0)
-; RV32IZBKB-NEXT:    lbu a6, 5(a0)
-; RV32IZBKB-NEXT:    lbu a7, 6(a0)
-; RV32IZBKB-NEXT:    lbu t0, 7(a0)
-; RV32IZBKB-NEXT:    packh a0, a3, a4
+; RV32IZBKB-NEXT:    lbu a5, 5(a0)
+; RV32IZBKB-NEXT:    lbu a6, 6(a0)
+; RV32IZBKB-NEXT:    lbu a7, 7(a0)
+; RV32IZBKB-NEXT:    lbu a0, 4(a0)
+; RV32IZBKB-NEXT:    packh a3, a3, a4
 ; RV32IZBKB-NEXT:    packh a1, a1, a2
-; RV32IZBKB-NEXT:    pack a0, a1, a0
-; RV32IZBKB-NEXT:    packh a1, a7, t0
-; RV32IZBKB-NEXT:    packh a2, a5, a6
-; RV32IZBKB-NEXT:    pack a1, a2, a1
+; RV32IZBKB-NEXT:    packh a2, a6, a7
+; RV32IZBKB-NEXT:    packh a4, a0, a5
+; RV32IZBKB-NEXT:    pack a0, a1, a3
+; RV32IZBKB-NEXT:    pack a1, a4, a2
 ; RV32IZBKB-NEXT:    ret
 ;
 ; RV64IZBKB-LABEL: load_i64:
@@ -205,18 +204,18 @@ define i64 @load_i64(ptr %p) {
 ; RV64IZBKB-NEXT:    lbu a2, 5(a0)
 ; RV64IZBKB-NEXT:    lbu a3, 6(a0)
 ; RV64IZBKB-NEXT:    lbu a4, 7(a0)
+; RV64IZBKB-NEXT:    lbu a5, 0(a0)
+; RV64IZBKB-NEXT:    lbu a6, 1(a0)
+; RV64IZBKB-NEXT:    lbu a7, 2(a0)
+; RV64IZBKB-NEXT:    lbu a0, 3(a0)
 ; RV64IZBKB-NEXT:    packh a1, a1, a2
 ; RV64IZBKB-NEXT:    packh a2, a3, a4
-; RV64IZBKB-NEXT:    lbu a3, 0(a0)
-; RV64IZBKB-NEXT:    lbu a4, 1(a0)
-; RV64IZBKB-NEXT:    lbu a5, 2(a0)
-; RV64IZBKB-NEXT:    lbu a0, 3(a0)
+; RV64IZBKB-NEXT:    packh a3, a5, a6
+; RV64IZBKB-NEXT:    packh a0, a7, a0
 ; RV64IZBKB-NEXT:    slli a2, a2, 16
-; RV64IZBKB-NEXT:    or a1, a2, a1
-; RV64IZBKB-NEXT:    packh a2, a3, a4
-; RV64IZBKB-NEXT:    packh a0, a5, a0
 ; RV64IZBKB-NEXT:    slli a0, a0, 16
-; RV64IZBKB-NEXT:    or a0, a0, a2
+; RV64IZBKB-NEXT:    or a1, a2, a1
+; RV64IZBKB-NEXT:    or a0, a0, a3
 ; RV64IZBKB-NEXT:    pack a0, a0, a1
 ; RV64IZBKB-NEXT:    ret
 ;
@@ -306,17 +305,17 @@ define void @store_i64(ptr %p, i64 %v) {
 ; RV32I-NEXT:    srli a3, a2, 24
 ; RV32I-NEXT:    srli a4, a2, 16
 ; RV32I-NEXT:    srli a5, a2, 8
+; RV32I-NEXT:    srli a6, a1, 24
+; RV32I-NEXT:    srli a7, a1, 16
 ; RV32I-NEXT:    sb a2, 4(a0)
 ; RV32I-NEXT:    sb a5, 5(a0)
 ; RV32I-NEXT:    sb a4, 6(a0)
 ; RV32I-NEXT:    sb a3, 7(a0)
-; RV32I-NEXT:    srli a2, a1, 24
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    srli a4, a1, 8
+; RV32I-NEXT:    srli a2, a1, 8
 ; RV32I-NEXT:    sb a1, 0(a0)
-; RV32I-NEXT:    sb a4, 1(a0)
-; RV32I-NEXT:    sb a3, 2(a0)
-; RV32I-NEXT:    sb a2, 3(a0)
+; RV32I-NEXT:    sb a2, 1(a0)
+; RV32I-NEXT:    sb a7, 2(a0)
+; RV32I-NEXT:    sb a6, 3(a0)
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: store_i64:
@@ -325,17 +324,17 @@ define void @store_i64(ptr %p, i64 %v) {
 ; RV64I-NEXT:    srli a3, a1, 48
 ; RV64I-NEXT:    srli a4, a1, 40
 ; RV64I-NEXT:    srli a5, a1, 32
+; RV64I-NEXT:    srli a6, a1, 24
+; RV64I-NEXT:    srli a7, a1, 16
 ; RV64I-NEXT:    sb a5, 4(a0)
 ; RV64I-NEXT:    sb a4, 5(a0)
 ; RV64I-NEXT:    sb a3, 6(a0)
 ; RV64I-NEXT:    sb a2, 7(a0)
-; RV64I-NEXT:    srli a2, a1, 24
-; RV64I-NEXT:    srli a3, a1, 16
-; RV64I-NEXT:    srli a4, a1, 8
+; RV64I-NEXT:    srli a2, a1, 8
 ; RV64I-NEXT:    sb a1, 0(a0)
-; RV64I-NEXT:    sb a4, 1(a0)
-; RV64I-NEXT:    sb a3, 2(a0)
-; RV64I-NEXT:    sb a2, 3(a0)
+; RV64I-NEXT:    sb a2, 1(a0)
+; RV64I-NEXT:    sb a7, 2(a0)
+; RV64I-NEXT:    sb a6, 3(a0)
 ; RV64I-NEXT:    ret
 ;
 ; RV32IZBKB-LABEL: store_i64:
@@ -343,17 +342,17 @@ define void @store_i64(ptr %p, i64 %v) {
 ; RV32IZBKB-NEXT:    srli a3, a2, 24
 ; RV32IZBKB-NEXT:    srli a4, a2, 16
 ; RV32IZBKB-NEXT:    srli a5, a2, 8
+; RV32IZBKB-NEXT:    srli a6, a1, 24
+; RV32IZBKB-NEXT:    srli a7, a1, 16
 ; RV32IZBKB-NEXT:    sb a2, 4(a0)
 ; RV32IZBKB-NEXT:    sb a5, 5(a0)
 ; RV32IZBKB-NEXT:    sb a4, 6(a0)
 ; RV32IZBKB-NEXT:    sb a3, 7(a0)
-; RV32IZBKB-NEXT:    srli a2, a1, 24
-; RV32IZBKB-NEXT:    srli a3, a1, 16
-; RV32IZBKB-NEXT:    srli a4, a1, 8
+; RV32IZBKB-NEXT:    srli a2, a1, 8
 ; RV32IZBKB-NEXT:    sb a1, 0(a0)
-; RV32IZBKB-NEXT:    sb a4, 1(a0)
-; RV32IZBKB-NEXT:    sb a3, 2(a0)
-; RV32IZBKB-NEXT:    sb a2, 3(a0)
+; RV32IZBKB-NEXT:    sb a2, 1(a0)
+; RV32IZBKB-NEXT:    sb a7, 2(a0)
+; RV32IZBKB-NEXT:    sb a6, 3(a0)
 ; RV32IZBKB-NEXT:    ret
 ;
 ; RV64IZBKB-LABEL: store_i64:
@@ -362,17 +361,17 @@ define void @store_i64(ptr %p, i64 %v) {
 ; RV64IZBKB-NEXT:    srli a3, a1, 48
 ; RV64IZBKB-NEXT:    srli a4, a1, 40
 ; RV64IZBKB-NEXT:    srli a5, a1, 32
+; RV64IZBKB-NEXT:    srli a6, a1, 24
+; RV64IZBKB-NEXT:    srli a7, a1, 16
 ; RV64IZBKB-NEXT:    sb a5, 4(a0)
 ; RV64IZBKB-NEXT:    sb a4, 5(a0)
 ; RV64IZBKB-NEXT:    sb a3, 6(a0)
 ; RV64IZBKB-NEXT:    sb a2, 7(a0)
-; RV64IZBKB-NEXT:    srli a2, a1, 24
-; RV64IZBKB-NEXT:    srli a3, a1, 16
-; RV64IZBKB-NEXT:    srli a4, a1, 8
+; RV64IZBKB-NEXT:    srli a2, a1, 8
 ; RV64IZBKB-NEXT:    sb a1, 0(a0)
-; RV64IZBKB-NEXT:    sb a4, 1(a0)
-; RV64IZBKB-NEXT:    sb a3, 2(a0)
-; RV64IZBKB-NEXT:    sb a2, 3(a0)
+; RV64IZBKB-NEXT:    sb a2, 1(a0)
+; RV64IZBKB-NEXT:    sb a7, 2(a0)
+; RV64IZBKB-NEXT:    sb a6, 3(a0)
 ; RV64IZBKB-NEXT:    ret
 ;
 ; RV32I-FAST-LABEL: store_i64:
@@ -546,25 +545,25 @@ define void @store_large_constant(ptr %x) {
 ; SLOW-NEXT:    li a2, 220
 ; SLOW-NEXT:    li a3, 186
 ; SLOW-NEXT:    li a4, 152
+; SLOW-NEXT:    li a5, 118
+; SLOW-NEXT:    li a6, 84
+; SLOW-NEXT:    li a7, 50
 ; SLOW-NEXT:    sb a4, 4(a0)
 ; SLOW-NEXT:    sb a3, 5(a0)
 ; SLOW-NEXT:    sb a2, 6(a0)
 ; SLOW-NEXT:    sb a1, 7(a0)
-; SLOW-NEXT:    li a1, 118
-; SLOW-NEXT:    li a2, 84
-; SLOW-NEXT:    li a3, 50
-; SLOW-NEXT:    li a4, 16
-; SLOW-NEXT:    sb a4, 0(a0)
-; SLOW-NEXT:    sb a3, 1(a0)
-; SLOW-NEXT:    sb a2, 2(a0)
-; SLOW-NEXT:    sb a1, 3(a0)
+; SLOW-NEXT:    li a1, 16
+; SLOW-NEXT:    sb a1, 0(a0)
+; SLOW-NEXT:    sb a7, 1(a0)
+; SLOW-NEXT:    sb a6, 2(a0)
+; SLOW-NEXT:    sb a5, 3(a0)
 ; SLOW-NEXT:    ret
 ;
 ; RV32I-FAST-LABEL: store_large_constant:
 ; RV32I-FAST:       # %bb.0:
 ; RV32I-FAST-NEXT:    lui a1, 1043916
-; RV32I-FAST-NEXT:    addi a1, a1, -1384
 ; RV32I-FAST-NEXT:    lui a2, 484675
+; RV32I-FAST-NEXT:    addi a1, a1, -1384
 ; RV32I-FAST-NEXT:    addi a2, a2, 528
 ; RV32I-FAST-NEXT:    sw a2, 0(a0)
 ; RV32I-FAST-NEXT:    sw a1, 4(a0)

diff  --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
index 22c0b798e1468c..1517e524a7f78d 100644
--- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
@@ -208,10 +208,10 @@ define i64 @in64(i64 %x, i64 %y, i64 %mask) {
 ; RV32ZBB:       # %bb.0:
 ; RV32ZBB-NEXT:    andn a2, a2, a4
 ; RV32ZBB-NEXT:    and a0, a0, a4
-; RV32ZBB-NEXT:    or a0, a0, a2
-; RV32ZBB-NEXT:    andn a2, a3, a5
+; RV32ZBB-NEXT:    andn a3, a3, a5
 ; RV32ZBB-NEXT:    and a1, a1, a5
-; RV32ZBB-NEXT:    or a1, a1, a2
+; RV32ZBB-NEXT:    or a0, a0, a2
+; RV32ZBB-NEXT:    or a1, a1, a3
 ; RV32ZBB-NEXT:    ret
 ;
 ; RV64ZBB-LABEL: in64:

diff  --git a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
index 65307363048376..602df6831452cb 100644
--- a/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
+++ b/llvm/test/CodeGen/RISCV/unroll-loop-cse.ll
@@ -11,17 +11,17 @@ define signext i32 @unroll_loop_cse() {
 ; CHECK-LABEL: unroll_loop_cse:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a0, %hi(x)
-; CHECK-NEXT:    lw a1, %lo(x)(a0)
-; CHECK-NEXT:    lui a0, %hi(check)
-; CHECK-NEXT:    lw a2, %lo(check)(a0)
+; CHECK-NEXT:    lui a1, %hi(check)
+; CHECK-NEXT:    lw a2, %lo(x)(a0)
+; CHECK-NEXT:    lw a1, %lo(check)(a1)
 ; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    bne a1, a2, .LBB0_6
+; CHECK-NEXT:    bne a2, a1, .LBB0_6
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a1, %hi(x)
 ; CHECK-NEXT:    addi a1, a1, %lo(x)
-; CHECK-NEXT:    lw a3, 4(a1)
 ; CHECK-NEXT:    lui a2, %hi(check)
 ; CHECK-NEXT:    addi a2, a2, %lo(check)
+; CHECK-NEXT:    lw a3, 4(a1)
 ; CHECK-NEXT:    lw a4, 4(a2)
 ; CHECK-NEXT:    bne a3, a4, .LBB0_6
 ; CHECK-NEXT:  # %bb.2:

diff  --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll
index f83a933c0b5c82..af5121dfe180d1 100644
--- a/llvm/test/CodeGen/RISCV/urem-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll
@@ -137,8 +137,8 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
 ; RV32IM-NEXT:    sub a2, a0, a1
 ; RV32IM-NEXT:    srli a2, a2, 1
 ; RV32IM-NEXT:    add a1, a2, a1
-; RV32IM-NEXT:    srli a1, a1, 6
 ; RV32IM-NEXT:    li a2, 95
+; RV32IM-NEXT:    srli a1, a1, 6
 ; RV32IM-NEXT:    mul a2, a1, a2
 ; RV32IM-NEXT:    add a0, a0, a1
 ; RV32IM-NEXT:    sub a0, a0, a2
@@ -177,8 +177,8 @@ define i32 @combine_urem_udiv(i32 %x) nounwind {
 ; RV64IM-NEXT:    subw a2, a0, a1
 ; RV64IM-NEXT:    srliw a2, a2, 1
 ; RV64IM-NEXT:    add a1, a2, a1
-; RV64IM-NEXT:    srli a1, a1, 6
 ; RV64IM-NEXT:    li a2, 95
+; RV64IM-NEXT:    srli a1, a1, 6
 ; RV64IM-NEXT:    mul a2, a1, a2
 ; RV64IM-NEXT:    add a0, a0, a1
 ; RV64IM-NEXT:    subw a0, a0, a2

diff  --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
index b887036372f7b2..c73a18c8869d55 100644
--- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll
@@ -91,9 +91,9 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    srli a0, a0, 6
 ; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    lui a1, 2341
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    srli a0, a0, 5
-; RV32-NEXT:    lui a1, 2341
 ; RV32-NEXT:    addi a1, a1, -1755
 ; RV32-NEXT:    sltu a0, a0, a1
 ; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -111,9 +111,9 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; RV64-NEXT:    slli a0, a0, 37
 ; RV64-NEXT:    srli a0, a0, 38
 ; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    lui a1, 2341
 ; RV64-NEXT:    slli a0, a0, 37
 ; RV64-NEXT:    srli a0, a0, 37
-; RV64-NEXT:    lui a1, 2341
 ; RV64-NEXT:    addiw a1, a1, -1755
 ; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -129,9 +129,9 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; RV32M-NEXT:    slli a0, a0, 5
 ; RV32M-NEXT:    srli a0, a0, 6
 ; RV32M-NEXT:    or a0, a0, a1
+; RV32M-NEXT:    lui a1, 2341
 ; RV32M-NEXT:    slli a0, a0, 5
 ; RV32M-NEXT:    srli a0, a0, 5
-; RV32M-NEXT:    lui a1, 2341
 ; RV32M-NEXT:    addi a1, a1, -1755
 ; RV32M-NEXT:    sltu a0, a0, a1
 ; RV32M-NEXT:    ret
@@ -145,9 +145,9 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; RV64M-NEXT:    slli a0, a0, 37
 ; RV64M-NEXT:    srli a0, a0, 38
 ; RV64M-NEXT:    or a0, a0, a1
+; RV64M-NEXT:    lui a1, 2341
 ; RV64M-NEXT:    slli a0, a0, 37
 ; RV64M-NEXT:    srli a0, a0, 37
-; RV64M-NEXT:    lui a1, 2341
 ; RV64M-NEXT:    addiw a1, a1, -1755
 ; RV64M-NEXT:    sltu a0, a0, a1
 ; RV64M-NEXT:    ret
@@ -161,9 +161,9 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; RV32MV-NEXT:    slli a0, a0, 5
 ; RV32MV-NEXT:    srli a0, a0, 6
 ; RV32MV-NEXT:    or a0, a0, a1
+; RV32MV-NEXT:    lui a1, 2341
 ; RV32MV-NEXT:    slli a0, a0, 5
 ; RV32MV-NEXT:    srli a0, a0, 5
-; RV32MV-NEXT:    lui a1, 2341
 ; RV32MV-NEXT:    addi a1, a1, -1755
 ; RV32MV-NEXT:    sltu a0, a0, a1
 ; RV32MV-NEXT:    ret
@@ -177,9 +177,9 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; RV64MV-NEXT:    slli a0, a0, 37
 ; RV64MV-NEXT:    srli a0, a0, 38
 ; RV64MV-NEXT:    or a0, a0, a1
+; RV64MV-NEXT:    lui a1, 2341
 ; RV64MV-NEXT:    slli a0, a0, 37
 ; RV64MV-NEXT:    srli a0, a0, 37
-; RV64MV-NEXT:    lui a1, 2341
 ; RV64MV-NEXT:    addiw a1, a1, -1755
 ; RV64MV-NEXT:    sltu a0, a0, a1
 ; RV64MV-NEXT:    ret
@@ -357,15 +357,15 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV32-NEXT:    mv a0, s2
 ; RV32-NEXT:    call __mulsi3
 ; RV32-NEXT:    addi a0, a0, -1463
-; RV32-NEXT:    andi a0, a0, 2047
-; RV32-NEXT:    sltiu a0, a0, 293
 ; RV32-NEXT:    addi s3, s3, -1
 ; RV32-NEXT:    addi s1, s1, -1
-; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    andi a0, a0, 2047
 ; RV32-NEXT:    andi a1, s3, 2047
+; RV32-NEXT:    slli s1, s1, 22
+; RV32-NEXT:    sltiu a0, a0, 293
+; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    andi a0, a0, 2047
 ; RV32-NEXT:    slli a0, a0, 11
-; RV32-NEXT:    slli s1, s1, 22
 ; RV32-NEXT:    or a0, a0, s1
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    sw a0, 0(s0)
@@ -413,14 +413,14 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV64-NEXT:    mv a0, s1
 ; RV64-NEXT:    call __muldi3
 ; RV64-NEXT:    addi a0, a0, -1638
-; RV64-NEXT:    andi a0, a0, 2047
-; RV64-NEXT:    sltiu a0, a0, 2
 ; RV64-NEXT:    addi s3, s3, -1
-; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    addi s2, s2, -1
+; RV64-NEXT:    andi a0, a0, 2047
 ; RV64-NEXT:    andi a1, s3, 2047
 ; RV64-NEXT:    andi a2, s2, 2047
+; RV64-NEXT:    sltiu a0, a0, 2
 ; RV64-NEXT:    slli a2, a2, 11
+; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    slli a0, a0, 22
 ; RV64-NEXT:    or a0, a2, a0
 ; RV64-NEXT:    or a0, a1, a0
@@ -440,39 +440,39 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV32M:       # %bb.0:
 ; RV32M-NEXT:    lbu a1, 4(a0)
 ; RV32M-NEXT:    lw a2, 0(a0)
-; RV32M-NEXT:    slli a1, a1, 10
-; RV32M-NEXT:    srli a3, a2, 22
-; RV32M-NEXT:    or a1, a3, a1
-; RV32M-NEXT:    srli a3, a2, 11
-; RV32M-NEXT:    andi a2, a2, 2047
-; RV32M-NEXT:    li a4, 683
-; RV32M-NEXT:    mul a2, a2, a4
-; RV32M-NEXT:    slli a4, a2, 10
-; RV32M-NEXT:    slli a2, a2, 21
-; RV32M-NEXT:    srli a2, a2, 22
-; RV32M-NEXT:    or a2, a2, a4
-; RV32M-NEXT:    andi a2, a2, 2047
-; RV32M-NEXT:    sltiu a2, a2, 342
+; RV32M-NEXT:    li a3, 683
 ; RV32M-NEXT:    li a4, 819
+; RV32M-NEXT:    slli a1, a1, 10
+; RV32M-NEXT:    srli a5, a2, 22
+; RV32M-NEXT:    or a1, a5, a1
+; RV32M-NEXT:    andi a5, a2, 2047
+; RV32M-NEXT:    mul a3, a5, a3
+; RV32M-NEXT:    li a5, 1463
+; RV32M-NEXT:    srli a2, a2, 11
+; RV32M-NEXT:    mul a2, a2, a5
+; RV32M-NEXT:    slli a5, a3, 10
+; RV32M-NEXT:    slli a3, a3, 21
 ; RV32M-NEXT:    mul a1, a1, a4
+; RV32M-NEXT:    addi a2, a2, -1463
+; RV32M-NEXT:    srli a3, a3, 22
 ; RV32M-NEXT:    addi a1, a1, -1638
+; RV32M-NEXT:    andi a2, a2, 2047
+; RV32M-NEXT:    or a3, a3, a5
 ; RV32M-NEXT:    andi a1, a1, 2047
-; RV32M-NEXT:    sltiu a1, a1, 2
-; RV32M-NEXT:    xori a4, a1, 1
-; RV32M-NEXT:    li a5, 1463
-; RV32M-NEXT:    mul a3, a3, a5
-; RV32M-NEXT:    addi a3, a3, -1463
+; RV32M-NEXT:    sltiu a2, a2, 293
 ; RV32M-NEXT:    andi a3, a3, 2047
-; RV32M-NEXT:    sltiu a3, a3, 293
+; RV32M-NEXT:    sltiu a1, a1, 2
 ; RV32M-NEXT:    addi a2, a2, -1
+; RV32M-NEXT:    sltiu a3, a3, 342
+; RV32M-NEXT:    xori a4, a1, 1
 ; RV32M-NEXT:    addi a1, a1, -1
-; RV32M-NEXT:    addi a3, a3, -1
 ; RV32M-NEXT:    andi a2, a2, 2047
-; RV32M-NEXT:    andi a3, a3, 2047
-; RV32M-NEXT:    slli a3, a3, 11
+; RV32M-NEXT:    addi a3, a3, -1
+; RV32M-NEXT:    slli a2, a2, 11
 ; RV32M-NEXT:    slli a1, a1, 22
-; RV32M-NEXT:    or a1, a3, a1
+; RV32M-NEXT:    andi a3, a3, 2047
 ; RV32M-NEXT:    or a1, a2, a1
+; RV32M-NEXT:    or a1, a3, a1
 ; RV32M-NEXT:    sw a1, 0(a0)
 ; RV32M-NEXT:    sb a4, 4(a0)
 ; RV32M-NEXT:    ret
@@ -481,38 +481,38 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV64M:       # %bb.0:
 ; RV64M-NEXT:    lbu a1, 4(a0)
 ; RV64M-NEXT:    lwu a2, 0(a0)
+; RV64M-NEXT:    li a3, 683
+; RV64M-NEXT:    li a4, 1463
 ; RV64M-NEXT:    slli a1, a1, 32
 ; RV64M-NEXT:    or a1, a2, a1
-; RV64M-NEXT:    srli a2, a1, 22
+; RV64M-NEXT:    andi a2, a1, 2047
+; RV64M-NEXT:    mul a2, a2, a3
 ; RV64M-NEXT:    srli a3, a1, 11
-; RV64M-NEXT:    andi a1, a1, 2047
-; RV64M-NEXT:    li a4, 683
-; RV64M-NEXT:    mul a1, a1, a4
-; RV64M-NEXT:    slli a4, a1, 10
-; RV64M-NEXT:    slli a1, a1, 53
-; RV64M-NEXT:    srli a1, a1, 54
-; RV64M-NEXT:    or a1, a1, a4
-; RV64M-NEXT:    andi a1, a1, 2047
-; RV64M-NEXT:    sltiu a1, a1, 342
-; RV64M-NEXT:    li a4, 1463
 ; RV64M-NEXT:    mul a3, a3, a4
+; RV64M-NEXT:    li a4, 819
+; RV64M-NEXT:    srli a1, a1, 22
+; RV64M-NEXT:    mul a1, a1, a4
+; RV64M-NEXT:    slli a4, a2, 10
+; RV64M-NEXT:    slli a2, a2, 53
 ; RV64M-NEXT:    addi a3, a3, -1463
+; RV64M-NEXT:    addi a1, a1, -1638
+; RV64M-NEXT:    srli a2, a2, 54
 ; RV64M-NEXT:    andi a3, a3, 2047
+; RV64M-NEXT:    andi a1, a1, 2047
+; RV64M-NEXT:    or a2, a2, a4
 ; RV64M-NEXT:    sltiu a3, a3, 293
-; RV64M-NEXT:    li a4, 819
-; RV64M-NEXT:    mul a2, a2, a4
-; RV64M-NEXT:    addi a2, a2, -1638
+; RV64M-NEXT:    sltiu a1, a1, 2
 ; RV64M-NEXT:    andi a2, a2, 2047
-; RV64M-NEXT:    sltiu a2, a2, 2
 ; RV64M-NEXT:    addi a1, a1, -1
-; RV64M-NEXT:    addi a2, a2, -1
 ; RV64M-NEXT:    addi a3, a3, -1
-; RV64M-NEXT:    andi a1, a1, 2047
+; RV64M-NEXT:    sltiu a2, a2, 342
 ; RV64M-NEXT:    andi a3, a3, 2047
+; RV64M-NEXT:    slli a1, a1, 22
+; RV64M-NEXT:    addi a2, a2, -1
 ; RV64M-NEXT:    slli a3, a3, 11
-; RV64M-NEXT:    slli a2, a2, 22
-; RV64M-NEXT:    or a2, a3, a2
-; RV64M-NEXT:    or a1, a1, a2
+; RV64M-NEXT:    andi a2, a2, 2047
+; RV64M-NEXT:    or a1, a3, a1
+; RV64M-NEXT:    or a1, a2, a1
 ; RV64M-NEXT:    slli a2, a1, 31
 ; RV64M-NEXT:    srli a2, a2, 63
 ; RV64M-NEXT:    sw a1, 0(a0)
@@ -523,58 +523,58 @@ define void @test_urem_vec(ptr %X) nounwind {
 ; RV32MV:       # %bb.0:
 ; RV32MV-NEXT:    lw a1, 0(a0)
 ; RV32MV-NEXT:    lbu a2, 4(a0)
-; RV32MV-NEXT:    andi a3, a1, 2047
 ; RV32MV-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; RV32MV-NEXT:    vmv.v.x v8, a3
-; RV32MV-NEXT:    slli a3, a1, 10
-; RV32MV-NEXT:    srli a3, a3, 21
-; RV32MV-NEXT:    vslide1down.vx v8, v8, a3
+; RV32MV-NEXT:    vid.v v8
+; RV32MV-NEXT:    lui a3, %hi(.LCPI4_0)
+; RV32MV-NEXT:    addi a3, a3, %lo(.LCPI4_0)
+; RV32MV-NEXT:    vle16.v v9, (a3)
+; RV32MV-NEXT:    andi a3, a1, 2047
 ; RV32MV-NEXT:    slli a2, a2, 10
-; RV32MV-NEXT:    srli a1, a1, 22
-; RV32MV-NEXT:    or a1, a1, a2
-; RV32MV-NEXT:    andi a1, a1, 2047
-; RV32MV-NEXT:    vslide1down.vx v8, v8, a1
-; RV32MV-NEXT:    lui a1, %hi(.LCPI4_0)
-; RV32MV-NEXT:    addi a1, a1, %lo(.LCPI4_0)
-; RV32MV-NEXT:    vle16.v v9, (a1)
-; RV32MV-NEXT:    vslidedown.vi v8, v8, 1
-; RV32MV-NEXT:    vid.v v10
-; RV32MV-NEXT:    vsub.vv v8, v8, v10
-; RV32MV-NEXT:    vmul.vv v8, v8, v9
-; RV32MV-NEXT:    vadd.vv v9, v8, v8
-; RV32MV-NEXT:    lui a1, 41121
-; RV32MV-NEXT:    addi a1, a1, -1527
-; RV32MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32MV-NEXT:    vmv.s.x v10, a1
-; RV32MV-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32MV-NEXT:    vsext.vf2 v11, v10
-; RV32MV-NEXT:    vsll.vv v9, v9, v11
+; RV32MV-NEXT:    vmv.v.x v10, a3
+; RV32MV-NEXT:    srli a3, a1, 22
+; RV32MV-NEXT:    or a2, a3, a2
+; RV32MV-NEXT:    lui a3, 41121
+; RV32MV-NEXT:    slli a1, a1, 10
+; RV32MV-NEXT:    srli a1, a1, 21
+; RV32MV-NEXT:    vslide1down.vx v10, v10, a1
 ; RV32MV-NEXT:    li a1, 2047
-; RV32MV-NEXT:    vand.vx v8, v8, a1
 ; RV32MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV32MV-NEXT:    vmv.v.i v10, 1
+; RV32MV-NEXT:    vmv.v.i v11, 1
+; RV32MV-NEXT:    andi a2, a2, 2047
 ; RV32MV-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV32MV-NEXT:    vsext.vf2 v11, v10
+; RV32MV-NEXT:    vslide1down.vx v10, v10, a2
 ; RV32MV-NEXT:    lui a2, %hi(.LCPI4_1)
 ; RV32MV-NEXT:    addi a2, a2, %lo(.LCPI4_1)
+; RV32MV-NEXT:    addi a3, a3, -1527
+; RV32MV-NEXT:    vsext.vf2 v12, v11
+; RV32MV-NEXT:    vslidedown.vi v10, v10, 1
+; RV32MV-NEXT:    vsub.vv v8, v10, v8
+; RV32MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32MV-NEXT:    vmv.s.x v10, a3
+; RV32MV-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; RV32MV-NEXT:    vsext.vf2 v11, v10
+; RV32MV-NEXT:    vmul.vv v8, v8, v9
+; RV32MV-NEXT:    vadd.vv v9, v8, v8
+; RV32MV-NEXT:    vsll.vv v9, v9, v11
 ; RV32MV-NEXT:    vle16.v v10, (a2)
-; RV32MV-NEXT:    vsrl.vv v8, v8, v11
+; RV32MV-NEXT:    vand.vx v8, v8, a1
+; RV32MV-NEXT:    vsrl.vv v8, v8, v12
 ; RV32MV-NEXT:    vor.vv v8, v8, v9
 ; RV32MV-NEXT:    vand.vx v8, v8, a1
 ; RV32MV-NEXT:    vmsltu.vv v0, v10, v8
 ; RV32MV-NEXT:    vmv.v.i v8, 0
 ; RV32MV-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV32MV-NEXT:    vslidedown.vi v9, v8, 2
-; RV32MV-NEXT:    vmv.x.s a1, v9
-; RV32MV-NEXT:    slli a2, a1, 21
-; RV32MV-NEXT:    srli a2, a2, 31
-; RV32MV-NEXT:    vmv.x.s a3, v8
-; RV32MV-NEXT:    andi a3, a3, 2047
+; RV32MV-NEXT:    vmv.x.s a1, v8
 ; RV32MV-NEXT:    vslidedown.vi v8, v8, 1
-; RV32MV-NEXT:    slli a1, a1, 22
-; RV32MV-NEXT:    or a1, a3, a1
+; RV32MV-NEXT:    vmv.x.s a2, v9
+; RV32MV-NEXT:    andi a1, a1, 2047
+; RV32MV-NEXT:    slli a3, a2, 22
+; RV32MV-NEXT:    or a1, a1, a3
 ; RV32MV-NEXT:    vmv.x.s a3, v8
+; RV32MV-NEXT:    slli a2, a2, 21
 ; RV32MV-NEXT:    andi a3, a3, 2047
+; RV32MV-NEXT:    srli a2, a2, 31
 ; RV32MV-NEXT:    slli a3, a3, 11
 ; RV32MV-NEXT:    or a1, a1, a3
 ; RV32MV-NEXT:    sw a1, 0(a0)
@@ -583,57 +583,57 @@ define void @test_urem_vec(ptr %X) nounwind {
 ;
 ; RV64MV-LABEL: test_urem_vec:
 ; RV64MV:       # %bb.0:
-; RV64MV-NEXT:    lbu a1, 4(a0)
-; RV64MV-NEXT:    lwu a2, 0(a0)
-; RV64MV-NEXT:    slli a1, a1, 32
-; RV64MV-NEXT:    or a1, a2, a1
+; RV64MV-NEXT:    lwu a1, 0(a0)
+; RV64MV-NEXT:    lbu a2, 4(a0)
+; RV64MV-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV64MV-NEXT:    vid.v v8
+; RV64MV-NEXT:    lui a3, %hi(.LCPI4_0)
+; RV64MV-NEXT:    addi a3, a3, %lo(.LCPI4_0)
+; RV64MV-NEXT:    vle16.v v9, (a3)
+; RV64MV-NEXT:    lui a3, 41121
+; RV64MV-NEXT:    slli a2, a2, 32
+; RV64MV-NEXT:    or a1, a1, a2
+; RV64MV-NEXT:    andi a2, a1, 2047
+; RV64MV-NEXT:    vmv.v.x v10, a2
 ; RV64MV-NEXT:    slli a2, a1, 42
 ; RV64MV-NEXT:    srli a2, a2, 53
-; RV64MV-NEXT:    andi a3, a1, 2047
-; RV64MV-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; RV64MV-NEXT:    vmv.v.x v8, a3
-; RV64MV-NEXT:    vslide1down.vx v8, v8, a2
-; RV64MV-NEXT:    srli a1, a1, 22
-; RV64MV-NEXT:    vslide1down.vx v8, v8, a1
-; RV64MV-NEXT:    lui a1, %hi(.LCPI4_0)
-; RV64MV-NEXT:    addi a1, a1, %lo(.LCPI4_0)
-; RV64MV-NEXT:    vle16.v v9, (a1)
-; RV64MV-NEXT:    vslidedown.vi v8, v8, 1
-; RV64MV-NEXT:    vid.v v10
-; RV64MV-NEXT:    vsub.vv v8, v8, v10
-; RV64MV-NEXT:    vmul.vv v8, v8, v9
-; RV64MV-NEXT:    vadd.vv v9, v8, v8
-; RV64MV-NEXT:    lui a1, 41121
-; RV64MV-NEXT:    addi a1, a1, -1527
+; RV64MV-NEXT:    vslide1down.vx v10, v10, a2
+; RV64MV-NEXT:    li a2, 2047
 ; RV64MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64MV-NEXT:    vmv.s.x v10, a1
+; RV64MV-NEXT:    vmv.v.i v11, 1
+; RV64MV-NEXT:    srli a1, a1, 22
 ; RV64MV-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; RV64MV-NEXT:    vsext.vf2 v11, v10
-; RV64MV-NEXT:    vsll.vv v9, v9, v11
-; RV64MV-NEXT:    li a1, 2047
-; RV64MV-NEXT:    vand.vx v8, v8, a1
+; RV64MV-NEXT:    vslide1down.vx v10, v10, a1
+; RV64MV-NEXT:    lui a1, %hi(.LCPI4_1)
+; RV64MV-NEXT:    addi a1, a1, %lo(.LCPI4_1)
+; RV64MV-NEXT:    addi a3, a3, -1527
+; RV64MV-NEXT:    vsext.vf2 v12, v11
+; RV64MV-NEXT:    vslidedown.vi v10, v10, 1
+; RV64MV-NEXT:    vsub.vv v8, v10, v8
 ; RV64MV-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; RV64MV-NEXT:    vmv.v.i v10, 1
+; RV64MV-NEXT:    vmv.s.x v10, a3
 ; RV64MV-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
 ; RV64MV-NEXT:    vsext.vf2 v11, v10
-; RV64MV-NEXT:    lui a2, %hi(.LCPI4_1)
-; RV64MV-NEXT:    addi a2, a2, %lo(.LCPI4_1)
-; RV64MV-NEXT:    vle16.v v10, (a2)
-; RV64MV-NEXT:    vsrl.vv v8, v8, v11
+; RV64MV-NEXT:    vmul.vv v8, v8, v9
+; RV64MV-NEXT:    vadd.vv v9, v8, v8
+; RV64MV-NEXT:    vsll.vv v9, v9, v11
+; RV64MV-NEXT:    vle16.v v10, (a1)
+; RV64MV-NEXT:    vand.vx v8, v8, a2
+; RV64MV-NEXT:    vsrl.vv v8, v8, v12
 ; RV64MV-NEXT:    vor.vv v8, v8, v9
-; RV64MV-NEXT:    vand.vx v8, v8, a1
+; RV64MV-NEXT:    vand.vx v8, v8, a2
 ; RV64MV-NEXT:    vmsltu.vv v0, v10, v8
 ; RV64MV-NEXT:    vmv.v.i v8, 0
 ; RV64MV-NEXT:    vmerge.vim v8, v8, -1, v0
 ; RV64MV-NEXT:    vmv.x.s a1, v8
-; RV64MV-NEXT:    andi a1, a1, 2047
 ; RV64MV-NEXT:    vslidedown.vi v9, v8, 1
-; RV64MV-NEXT:    vmv.x.s a2, v9
-; RV64MV-NEXT:    andi a2, a2, 2047
-; RV64MV-NEXT:    slli a2, a2, 11
 ; RV64MV-NEXT:    vslidedown.vi v8, v8, 2
+; RV64MV-NEXT:    andi a1, a1, 2047
+; RV64MV-NEXT:    vmv.x.s a2, v9
 ; RV64MV-NEXT:    vmv.x.s a3, v8
+; RV64MV-NEXT:    andi a2, a2, 2047
 ; RV64MV-NEXT:    slli a3, a3, 22
+; RV64MV-NEXT:    slli a2, a2, 11
 ; RV64MV-NEXT:    or a1, a1, a3
 ; RV64MV-NEXT:    or a1, a1, a2
 ; RV64MV-NEXT:    slli a2, a1, 31

diff  --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index 01f06474f78c23..988856ca709235 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -59,30 +59,30 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lhu a4, 8(a1)
 ; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 8456
+; RV32IM-NEXT:    lui a6, 11038
+; RV32IM-NEXT:    li a7, 95
+; RV32IM-NEXT:    lui t0, 10700
+; RV32IM-NEXT:    li t1, 98
+; RV32IM-NEXT:    addi a6, a6, -1465
+; RV32IM-NEXT:    mulhu a6, a2, a6
+; RV32IM-NEXT:    mul a6, a6, a7
+; RV32IM-NEXT:    lui a7, 1045
+; RV32IM-NEXT:    addi t0, t0, -1003
+; RV32IM-NEXT:    mulhu t0, a4, t0
+; RV32IM-NEXT:    mul t0, t0, t1
+; RV32IM-NEXT:    li t1, 1003
 ; RV32IM-NEXT:    addi a5, a5, 1058
+; RV32IM-NEXT:    addi a7, a7, 1801
 ; RV32IM-NEXT:    mulhu a5, a3, a5
-; RV32IM-NEXT:    slli a6, a5, 7
+; RV32IM-NEXT:    mulhu a7, a1, a7
+; RV32IM-NEXT:    mul a7, a7, t1
+; RV32IM-NEXT:    slli t1, a5, 7
 ; RV32IM-NEXT:    slli a5, a5, 2
-; RV32IM-NEXT:    sub a5, a5, a6
+; RV32IM-NEXT:    sub a5, a5, t1
+; RV32IM-NEXT:    sub a2, a2, a6
+; RV32IM-NEXT:    sub a4, a4, t0
+; RV32IM-NEXT:    sub a1, a1, a7
 ; RV32IM-NEXT:    add a3, a3, a5
-; RV32IM-NEXT:    lui a5, 11038
-; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a5, a2, a5
-; RV32IM-NEXT:    li a6, 95
-; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a2, a2, a5
-; RV32IM-NEXT:    lui a5, 10700
-; RV32IM-NEXT:    addi a5, a5, -1003
-; RV32IM-NEXT:    mulhu a5, a4, a5
-; RV32IM-NEXT:    li a6, 98
-; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
-; RV32IM-NEXT:    lui a5, 1045
-; RV32IM-NEXT:    addi a5, a5, 1801
-; RV32IM-NEXT:    mulhu a5, a1, a5
-; RV32IM-NEXT:    li a6, 1003
-; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a1, a1, a5
 ; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    sh a3, 2(a0)
 ; RV32IM-NEXT:    sh a4, 4(a0)
@@ -133,38 +133,38 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: fold_urem_vec_1:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, %hi(.LCPI0_0)
-; RV64IM-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
+; RV64IM-NEXT:    lhu a2, 0(a1)
 ; RV64IM-NEXT:    lhu a3, 8(a1)
-; RV64IM-NEXT:    lhu a4, 0(a1)
-; RV64IM-NEXT:    lhu a5, 16(a1)
+; RV64IM-NEXT:    lhu a4, 16(a1)
 ; RV64IM-NEXT:    lhu a1, 24(a1)
-; RV64IM-NEXT:    mulhu a2, a3, a2
-; RV64IM-NEXT:    slli a6, a2, 7
-; RV64IM-NEXT:    lui a7, %hi(.LCPI0_1)
-; RV64IM-NEXT:    ld a7, %lo(.LCPI0_1)(a7)
-; RV64IM-NEXT:    slli a2, a2, 2
-; RV64IM-NEXT:    subw a2, a2, a6
-; RV64IM-NEXT:    add a2, a3, a2
-; RV64IM-NEXT:    mulhu a3, a4, a7
-; RV64IM-NEXT:    lui a6, %hi(.LCPI0_2)
-; RV64IM-NEXT:    ld a6, %lo(.LCPI0_2)(a6)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI0_0)
+; RV64IM-NEXT:    lui a6, %hi(.LCPI0_1)
 ; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a4, a4, a3
-; RV64IM-NEXT:    mulhu a3, a5, a6
-; RV64IM-NEXT:    lui a6, %hi(.LCPI0_3)
-; RV64IM-NEXT:    ld a6, %lo(.LCPI0_3)(a6)
-; RV64IM-NEXT:    li a7, 98
-; RV64IM-NEXT:    mul a3, a3, a7
-; RV64IM-NEXT:    subw a5, a5, a3
-; RV64IM-NEXT:    mulhu a3, a1, a6
-; RV64IM-NEXT:    li a6, 1003
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    sh a4, 0(a0)
-; RV64IM-NEXT:    sh a2, 2(a0)
-; RV64IM-NEXT:    sh a5, 4(a0)
+; RV64IM-NEXT:    ld a6, %lo(.LCPI0_1)(a6)
+; RV64IM-NEXT:    lui t0, %hi(.LCPI0_2)
+; RV64IM-NEXT:    li t1, 98
+; RV64IM-NEXT:    ld t0, %lo(.LCPI0_2)(t0)
+; RV64IM-NEXT:    mulhu a6, a2, a6
+; RV64IM-NEXT:    mul a6, a6, a7
+; RV64IM-NEXT:    lui a7, %hi(.LCPI0_3)
+; RV64IM-NEXT:    ld a5, %lo(.LCPI0_0)(a5)
+; RV64IM-NEXT:    ld a7, %lo(.LCPI0_3)(a7)
+; RV64IM-NEXT:    mulhu t0, a4, t0
+; RV64IM-NEXT:    mul t0, t0, t1
+; RV64IM-NEXT:    li t1, 1003
+; RV64IM-NEXT:    mulhu a5, a3, a5
+; RV64IM-NEXT:    mulhu a7, a1, a7
+; RV64IM-NEXT:    mul a7, a7, t1
+; RV64IM-NEXT:    slli t1, a5, 7
+; RV64IM-NEXT:    slli a5, a5, 2
+; RV64IM-NEXT:    subw a5, a5, t1
+; RV64IM-NEXT:    subw a2, a2, a6
+; RV64IM-NEXT:    subw a4, a4, t0
+; RV64IM-NEXT:    subw a1, a1, a7
+; RV64IM-NEXT:    add a3, a3, a5
+; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a3, 2(a0)
+; RV64IM-NEXT:    sh a4, 4(a0)
 ; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
@@ -221,19 +221,19 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lhu a4, 8(a1)
 ; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 11038
+; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a6, a2, a5
-; RV32IM-NEXT:    li a7, 95
-; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a2, a2, a6
-; RV32IM-NEXT:    mulhu a6, a3, a5
-; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a3, a3, a6
-; RV32IM-NEXT:    mulhu a6, a4, a5
-; RV32IM-NEXT:    mul a6, a6, a7
-; RV32IM-NEXT:    sub a4, a4, a6
+; RV32IM-NEXT:    mulhu a7, a2, a5
+; RV32IM-NEXT:    mulhu t0, a3, a5
+; RV32IM-NEXT:    mulhu t1, a4, a5
 ; RV32IM-NEXT:    mulhu a5, a1, a5
-; RV32IM-NEXT:    mul a5, a5, a7
+; RV32IM-NEXT:    mul a7, a7, a6
+; RV32IM-NEXT:    mul t0, t0, a6
+; RV32IM-NEXT:    mul t1, t1, a6
+; RV32IM-NEXT:    mul a5, a5, a6
+; RV32IM-NEXT:    sub a2, a2, a7
+; RV32IM-NEXT:    sub a3, a3, t0
+; RV32IM-NEXT:    sub a4, a4, t1
 ; RV32IM-NEXT:    sub a1, a1, a5
 ; RV32IM-NEXT:    sh a2, 0(a0)
 ; RV32IM-NEXT:    sh a3, 2(a0)
@@ -291,18 +291,18 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lhu a4, 8(a1)
 ; RV64IM-NEXT:    lhu a5, 16(a1)
 ; RV64IM-NEXT:    lhu a1, 24(a1)
-; RV64IM-NEXT:    mulhu a6, a3, a2
-; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a3, a3, a6
-; RV64IM-NEXT:    mulhu a6, a4, a2
-; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a4, a4, a6
-; RV64IM-NEXT:    mulhu a6, a5, a2
-; RV64IM-NEXT:    mul a6, a6, a7
-; RV64IM-NEXT:    subw a5, a5, a6
+; RV64IM-NEXT:    li a6, 95
+; RV64IM-NEXT:    mulhu a7, a3, a2
+; RV64IM-NEXT:    mulhu t0, a4, a2
+; RV64IM-NEXT:    mulhu t1, a5, a2
 ; RV64IM-NEXT:    mulhu a2, a1, a2
-; RV64IM-NEXT:    mul a2, a2, a7
+; RV64IM-NEXT:    mul a7, a7, a6
+; RV64IM-NEXT:    mul t0, t0, a6
+; RV64IM-NEXT:    mul t1, t1, a6
+; RV64IM-NEXT:    mul a2, a2, a6
+; RV64IM-NEXT:    subw a3, a3, a7
+; RV64IM-NEXT:    subw a4, a4, t0
+; RV64IM-NEXT:    subw a5, a5, t1
 ; RV64IM-NEXT:    subw a1, a1, a2
 ; RV64IM-NEXT:    sh a3, 0(a0)
 ; RV64IM-NEXT:    sh a4, 2(a0)
@@ -388,33 +388,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: combine_urem_udiv:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 12(a1)
-; RV32IM-NEXT:    lhu a3, 0(a1)
-; RV32IM-NEXT:    lhu a4, 4(a1)
-; RV32IM-NEXT:    lhu a1, 8(a1)
+; RV32IM-NEXT:    lhu a2, 0(a1)
+; RV32IM-NEXT:    lhu a3, 4(a1)
+; RV32IM-NEXT:    lhu a4, 8(a1)
+; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 11038
+; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a6, a2, a5
-; RV32IM-NEXT:    li a7, 95
-; RV32IM-NEXT:    mul t0, a6, a7
-; RV32IM-NEXT:    mulhu t1, a1, a5
-; RV32IM-NEXT:    mul t2, t1, a7
-; RV32IM-NEXT:    mulhu t3, a4, a5
-; RV32IM-NEXT:    mul t4, t3, a7
-; RV32IM-NEXT:    mulhu a5, a3, a5
-; RV32IM-NEXT:    mul a7, a5, a7
-; RV32IM-NEXT:    add a3, a3, a5
-; RV32IM-NEXT:    sub a3, a3, a7
-; RV32IM-NEXT:    add a4, a4, t3
-; RV32IM-NEXT:    sub a4, a4, t4
-; RV32IM-NEXT:    add a1, a1, t1
+; RV32IM-NEXT:    mulhu a7, a1, a5
+; RV32IM-NEXT:    mulhu t0, a4, a5
+; RV32IM-NEXT:    mulhu t1, a3, a5
+; RV32IM-NEXT:    mulhu a5, a2, a5
+; RV32IM-NEXT:    mul t2, a7, a6
+; RV32IM-NEXT:    mul t3, t0, a6
+; RV32IM-NEXT:    mul t4, t1, a6
+; RV32IM-NEXT:    mul a6, a5, a6
+; RV32IM-NEXT:    add a2, a2, a5
+; RV32IM-NEXT:    add a3, a3, t1
+; RV32IM-NEXT:    add a4, a4, t0
+; RV32IM-NEXT:    add a1, a1, a7
+; RV32IM-NEXT:    sub a2, a2, a6
+; RV32IM-NEXT:    sub a3, a3, t4
+; RV32IM-NEXT:    sub a4, a4, t3
 ; RV32IM-NEXT:    sub a1, a1, t2
-; RV32IM-NEXT:    add a2, a2, a6
-; RV32IM-NEXT:    sub a2, a2, t0
-; RV32IM-NEXT:    sh a3, 0(a0)
-; RV32IM-NEXT:    sh a4, 2(a0)
-; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a2, 6(a0)
+; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
+; RV32IM-NEXT:    sh a1, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: combine_urem_udiv:
@@ -489,33 +489,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: combine_urem_udiv:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 24(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI2_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI2_0)(a3)
-; RV64IM-NEXT:    lhu a4, 0(a1)
-; RV64IM-NEXT:    lhu a5, 8(a1)
-; RV64IM-NEXT:    lhu a1, 16(a1)
-; RV64IM-NEXT:    mulhu a6, a2, a3
-; RV64IM-NEXT:    li a7, 95
-; RV64IM-NEXT:    mul t0, a6, a7
-; RV64IM-NEXT:    mulhu t1, a1, a3
-; RV64IM-NEXT:    mul t2, t1, a7
-; RV64IM-NEXT:    mulhu t3, a5, a3
-; RV64IM-NEXT:    mul t4, t3, a7
-; RV64IM-NEXT:    mulhu a3, a4, a3
-; RV64IM-NEXT:    mul a7, a3, a7
-; RV64IM-NEXT:    add a3, a4, a3
-; RV64IM-NEXT:    subw a3, a3, a7
-; RV64IM-NEXT:    add a5, a5, t3
-; RV64IM-NEXT:    subw a4, a5, t4
+; RV64IM-NEXT:    lhu a2, 16(a1)
+; RV64IM-NEXT:    lhu a3, 24(a1)
+; RV64IM-NEXT:    lui a4, %hi(.LCPI2_0)
+; RV64IM-NEXT:    ld a4, %lo(.LCPI2_0)(a4)
+; RV64IM-NEXT:    lhu a5, 0(a1)
+; RV64IM-NEXT:    lhu a1, 8(a1)
+; RV64IM-NEXT:    li a6, 95
+; RV64IM-NEXT:    mulhu a7, a3, a4
+; RV64IM-NEXT:    mulhu t0, a2, a4
+; RV64IM-NEXT:    mulhu t1, a1, a4
+; RV64IM-NEXT:    mulhu a4, a5, a4
+; RV64IM-NEXT:    mul t2, a7, a6
+; RV64IM-NEXT:    mul t3, t0, a6
+; RV64IM-NEXT:    mul t4, t1, a6
+; RV64IM-NEXT:    mul a6, a4, a6
+; RV64IM-NEXT:    add a4, a5, a4
 ; RV64IM-NEXT:    add a1, a1, t1
-; RV64IM-NEXT:    subw a1, a1, t2
-; RV64IM-NEXT:    add a2, a2, a6
-; RV64IM-NEXT:    subw a2, a2, t0
-; RV64IM-NEXT:    sh a3, 0(a0)
-; RV64IM-NEXT:    sh a4, 2(a0)
-; RV64IM-NEXT:    sh a1, 4(a0)
-; RV64IM-NEXT:    sh a2, 6(a0)
+; RV64IM-NEXT:    add a2, a2, t0
+; RV64IM-NEXT:    add a3, a3, a7
+; RV64IM-NEXT:    subw a4, a4, a6
+; RV64IM-NEXT:    subw a1, a1, t4
+; RV64IM-NEXT:    subw a2, a2, t3
+; RV64IM-NEXT:    subw a3, a3, t2
+; RV64IM-NEXT:    sh a4, 0(a0)
+; RV64IM-NEXT:    sh a1, 2(a0)
+; RV64IM-NEXT:    sh a2, 4(a0)
+; RV64IM-NEXT:    sh a3, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
   %2 = udiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -558,23 +558,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_power_of_two:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 4(a1)
-; RV32IM-NEXT:    lhu a3, 8(a1)
-; RV32IM-NEXT:    lhu a4, 12(a1)
-; RV32IM-NEXT:    lhu a1, 0(a1)
+; RV32IM-NEXT:    lhu a2, 0(a1)
+; RV32IM-NEXT:    lhu a3, 4(a1)
+; RV32IM-NEXT:    lhu a4, 8(a1)
+; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a5, 11038
-; RV32IM-NEXT:    addi a5, a5, -1465
-; RV32IM-NEXT:    mulhu a5, a4, a5
 ; RV32IM-NEXT:    li a6, 95
+; RV32IM-NEXT:    addi a5, a5, -1465
+; RV32IM-NEXT:    mulhu a5, a1, a5
+; RV32IM-NEXT:    andi a2, a2, 63
+; RV32IM-NEXT:    andi a3, a3, 31
+; RV32IM-NEXT:    andi a4, a4, 7
 ; RV32IM-NEXT:    mul a5, a5, a6
-; RV32IM-NEXT:    sub a4, a4, a5
-; RV32IM-NEXT:    andi a1, a1, 63
-; RV32IM-NEXT:    andi a2, a2, 31
-; RV32IM-NEXT:    andi a3, a3, 7
-; RV32IM-NEXT:    sh a1, 0(a0)
-; RV32IM-NEXT:    sh a2, 2(a0)
-; RV32IM-NEXT:    sh a3, 4(a0)
-; RV32IM-NEXT:    sh a4, 6(a0)
+; RV32IM-NEXT:    sub a1, a1, a5
+; RV32IM-NEXT:    sh a2, 0(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
+; RV32IM-NEXT:    sh a4, 4(a0)
+; RV32IM-NEXT:    sh a1, 6(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_urem_power_of_two:
@@ -610,23 +610,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_power_of_two:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 24(a1)
-; RV64IM-NEXT:    lui a3, %hi(.LCPI3_0)
-; RV64IM-NEXT:    ld a3, %lo(.LCPI3_0)(a3)
+; RV64IM-NEXT:    lhu a2, 0(a1)
+; RV64IM-NEXT:    lhu a3, 8(a1)
 ; RV64IM-NEXT:    lhu a4, 16(a1)
-; RV64IM-NEXT:    lhu a5, 8(a1)
-; RV64IM-NEXT:    lhu a1, 0(a1)
-; RV64IM-NEXT:    mulhu a3, a2, a3
+; RV64IM-NEXT:    lhu a1, 24(a1)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI3_0)
 ; RV64IM-NEXT:    li a6, 95
-; RV64IM-NEXT:    mul a3, a3, a6
-; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    andi a1, a1, 63
-; RV64IM-NEXT:    andi a5, a5, 31
+; RV64IM-NEXT:    ld a5, %lo(.LCPI3_0)(a5)
+; RV64IM-NEXT:    andi a2, a2, 63
+; RV64IM-NEXT:    andi a3, a3, 31
 ; RV64IM-NEXT:    andi a4, a4, 7
-; RV64IM-NEXT:    sh a1, 0(a0)
-; RV64IM-NEXT:    sh a5, 2(a0)
+; RV64IM-NEXT:    mulhu a5, a1, a5
+; RV64IM-NEXT:    mul a5, a5, a6
+; RV64IM-NEXT:    subw a1, a1, a5
+; RV64IM-NEXT:    sh a2, 0(a0)
+; RV64IM-NEXT:    sh a3, 2(a0)
 ; RV64IM-NEXT:    sh a4, 4(a0)
-; RV64IM-NEXT:    sh a2, 6(a0)
+; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
   ret <4 x i16> %1
@@ -676,24 +676,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lhu a3, 8(a1)
 ; RV32IM-NEXT:    lhu a1, 12(a1)
 ; RV32IM-NEXT:    lui a4, 1603
+; RV32IM-NEXT:    li a5, 654
+; RV32IM-NEXT:    lui a6, 45590
+; RV32IM-NEXT:    li a7, 23
 ; RV32IM-NEXT:    addi a4, a4, 1341
 ; RV32IM-NEXT:    mulhu a4, a2, a4
-; RV32IM-NEXT:    li a5, 654
 ; RV32IM-NEXT:    mul a4, a4, a5
+; RV32IM-NEXT:    lui a5, 193
+; RV32IM-NEXT:    addi a6, a6, 1069
+; RV32IM-NEXT:    mulhu a6, a3, a6
+; RV32IM-NEXT:    mul a6, a6, a7
+; RV32IM-NEXT:    lui a7, 1
+; RV32IM-NEXT:    addi a5, a5, 1464
+; RV32IM-NEXT:    addi a7, a7, 1327
+; RV32IM-NEXT:    mulhu a5, a1, a5
+; RV32IM-NEXT:    mul a5, a5, a7
 ; RV32IM-NEXT:    sub a2, a2, a4
-; RV32IM-NEXT:    lui a4, 45590
-; RV32IM-NEXT:    addi a4, a4, 1069
-; RV32IM-NEXT:    mulhu a4, a3, a4
-; RV32IM-NEXT:    li a5, 23
-; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
-; RV32IM-NEXT:    lui a4, 193
-; RV32IM-NEXT:    addi a4, a4, 1464
-; RV32IM-NEXT:    mulhu a4, a1, a4
-; RV32IM-NEXT:    lui a5, 1
-; RV32IM-NEXT:    addi a5, a5, 1327
-; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a1, a1, a4
+; RV32IM-NEXT:    sub a3, a3, a6
+; RV32IM-NEXT:    sub a1, a1, a5
 ; RV32IM-NEXT:    sh zero, 0(a0)
 ; RV32IM-NEXT:    sh a2, 2(a0)
 ; RV32IM-NEXT:    sh a3, 4(a0)
@@ -738,31 +738,31 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_one:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, %hi(.LCPI4_0)
-; RV64IM-NEXT:    ld a2, %lo(.LCPI4_0)(a2)
-; RV64IM-NEXT:    lhu a3, 8(a1)
-; RV64IM-NEXT:    lhu a4, 16(a1)
+; RV64IM-NEXT:    lhu a2, 8(a1)
+; RV64IM-NEXT:    lhu a3, 16(a1)
 ; RV64IM-NEXT:    lhu a1, 24(a1)
-; RV64IM-NEXT:    mulhu a2, a3, a2
-; RV64IM-NEXT:    lui a5, %hi(.LCPI4_1)
-; RV64IM-NEXT:    ld a5, %lo(.LCPI4_1)(a5)
-; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    subw a3, a3, a2
-; RV64IM-NEXT:    mulhu a2, a4, a5
+; RV64IM-NEXT:    lui a4, %hi(.LCPI4_0)
+; RV64IM-NEXT:    li a5, 654
+; RV64IM-NEXT:    ld a4, %lo(.LCPI4_0)(a4)
+; RV64IM-NEXT:    lui a6, %hi(.LCPI4_1)
+; RV64IM-NEXT:    li a7, 23
+; RV64IM-NEXT:    ld a6, %lo(.LCPI4_1)(a6)
+; RV64IM-NEXT:    mulhu a4, a2, a4
+; RV64IM-NEXT:    mul a4, a4, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_2)(a5)
-; RV64IM-NEXT:    li a6, 23
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    subw a4, a4, a2
-; RV64IM-NEXT:    mulhu a2, a1, a5
-; RV64IM-NEXT:    lui a5, 1
-; RV64IM-NEXT:    addi a5, a5, 1327
-; RV64IM-NEXT:    mul a2, a2, a5
-; RV64IM-NEXT:    subw a1, a1, a2
+; RV64IM-NEXT:    mulhu a6, a3, a6
+; RV64IM-NEXT:    mul a6, a6, a7
+; RV64IM-NEXT:    lui a7, 1
+; RV64IM-NEXT:    addi a7, a7, 1327
+; RV64IM-NEXT:    mulhu a5, a1, a5
+; RV64IM-NEXT:    mul a5, a5, a7
+; RV64IM-NEXT:    subw a2, a2, a4
+; RV64IM-NEXT:    subw a3, a3, a6
+; RV64IM-NEXT:    subw a1, a1, a5
 ; RV64IM-NEXT:    sh zero, 0(a0)
-; RV64IM-NEXT:    sh a3, 2(a0)
-; RV64IM-NEXT:    sh a4, 4(a0)
+; RV64IM-NEXT:    sh a2, 2(a0)
+; RV64IM-NEXT:    sh a3, 4(a0)
 ; RV64IM-NEXT:    sh a1, 6(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
@@ -958,37 +958,37 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_i64:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lui a2, %hi(.LCPI6_0)
-; RV64IM-NEXT:    ld a2, %lo(.LCPI6_0)(a2)
+; RV64IM-NEXT:    ld a2, 8(a1)
 ; RV64IM-NEXT:    ld a3, 16(a1)
-; RV64IM-NEXT:    ld a4, 8(a1)
 ; RV64IM-NEXT:    ld a1, 24(a1)
-; RV64IM-NEXT:    mulhu a2, a3, a2
-; RV64IM-NEXT:    sub a5, a3, a2
-; RV64IM-NEXT:    srli a5, a5, 1
-; RV64IM-NEXT:    add a2, a5, a2
-; RV64IM-NEXT:    srli a2, a2, 4
-; RV64IM-NEXT:    li a5, 23
-; RV64IM-NEXT:    lui a6, %hi(.LCPI6_1)
-; RV64IM-NEXT:    ld a6, %lo(.LCPI6_1)(a6)
-; RV64IM-NEXT:    mul a2, a2, a5
-; RV64IM-NEXT:    sub a3, a3, a2
-; RV64IM-NEXT:    srli a2, a4, 1
-; RV64IM-NEXT:    mulhu a2, a2, a6
-; RV64IM-NEXT:    srli a2, a2, 7
-; RV64IM-NEXT:    lui a5, %hi(.LCPI6_2)
-; RV64IM-NEXT:    ld a5, %lo(.LCPI6_2)(a5)
+; RV64IM-NEXT:    lui a4, %hi(.LCPI6_1)
+; RV64IM-NEXT:    ld a4, %lo(.LCPI6_1)(a4)
+; RV64IM-NEXT:    lui a5, %hi(.LCPI6_0)
 ; RV64IM-NEXT:    li a6, 654
-; RV64IM-NEXT:    mul a2, a2, a6
-; RV64IM-NEXT:    sub a4, a4, a2
-; RV64IM-NEXT:    mulhu a2, a1, a5
-; RV64IM-NEXT:    srli a2, a2, 12
-; RV64IM-NEXT:    lui a5, 1
-; RV64IM-NEXT:    addiw a5, a5, 1327
-; RV64IM-NEXT:    mul a2, a2, a5
-; RV64IM-NEXT:    sub a1, a1, a2
+; RV64IM-NEXT:    srli a7, a2, 1
+; RV64IM-NEXT:    mulhu a4, a7, a4
+; RV64IM-NEXT:    lui a7, %hi(.LCPI6_2)
+; RV64IM-NEXT:    ld a5, %lo(.LCPI6_0)(a5)
+; RV64IM-NEXT:    ld a7, %lo(.LCPI6_2)(a7)
+; RV64IM-NEXT:    srli a4, a4, 7
+; RV64IM-NEXT:    mul a4, a4, a6
+; RV64IM-NEXT:    lui a6, 1
+; RV64IM-NEXT:    addiw a6, a6, 1327
+; RV64IM-NEXT:    mulhu a5, a3, a5
+; RV64IM-NEXT:    mulhu a7, a1, a7
+; RV64IM-NEXT:    srli a7, a7, 12
+; RV64IM-NEXT:    mul a6, a7, a6
+; RV64IM-NEXT:    sub a7, a3, a5
+; RV64IM-NEXT:    srli a7, a7, 1
+; RV64IM-NEXT:    add a5, a7, a5
+; RV64IM-NEXT:    sub a2, a2, a4
+; RV64IM-NEXT:    sub a1, a1, a6
+; RV64IM-NEXT:    li a4, 23
+; RV64IM-NEXT:    srli a5, a5, 4
+; RV64IM-NEXT:    mul a4, a5, a4
+; RV64IM-NEXT:    sub a3, a3, a4
 ; RV64IM-NEXT:    sd zero, 0(a0)
-; RV64IM-NEXT:    sd a4, 8(a0)
+; RV64IM-NEXT:    sd a2, 8(a0)
 ; RV64IM-NEXT:    sd a3, 16(a0)
 ; RV64IM-NEXT:    sd a1, 24(a0)
 ; RV64IM-NEXT:    ret

diff  --git a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
index c76a53468f7689..b09ff9805eb97a 100644
--- a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
+++ b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll
@@ -108,9 +108,9 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV32I-LABEL: func16:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a3, 16
+; RV32I-NEXT:    mul a1, a1, a2
 ; RV32I-NEXT:    addi a3, a3, -1
 ; RV32I-NEXT:    and a0, a0, a3
-; RV32I-NEXT:    mul a1, a1, a2
 ; RV32I-NEXT:    and a1, a1, a3
 ; RV32I-NEXT:    sub a1, a0, a1
 ; RV32I-NEXT:    sltu a0, a0, a1
@@ -121,9 +121,9 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind {
 ; RV64I-LABEL: func16:
 ; RV64I:       # %bb.0:
 ; RV64I-NEXT:    lui a3, 16
+; RV64I-NEXT:    mul a1, a1, a2
 ; RV64I-NEXT:    addiw a3, a3, -1
 ; RV64I-NEXT:    and a0, a0, a3
-; RV64I-NEXT:    mul a1, a1, a2
 ; RV64I-NEXT:    and a1, a1, a3
 ; RV64I-NEXT:    sub a1, a0, a1
 ; RV64I-NEXT:    sltu a0, a0, a1

diff  --git a/llvm/test/CodeGen/RISCV/vararg-ilp32e.ll b/llvm/test/CodeGen/RISCV/vararg-ilp32e.ll
index 281c19dc09712f..91999444fa7665 100644
--- a/llvm/test/CodeGen/RISCV/vararg-ilp32e.ll
+++ b/llvm/test/CodeGen/RISCV/vararg-ilp32e.ll
@@ -78,9 +78,9 @@ define void @va_double(i32 %n, ...) {
 ; ILP32E-NEXT:    sw a3, 20(sp)
 ; ILP32E-NEXT:    sw a4, 24(sp)
 ; ILP32E-NEXT:    addi a0, sp, 12
+; ILP32E-NEXT:    addi a1, sp, 19
 ; ILP32E-NEXT:    sw a0, 0(sp)
-; ILP32E-NEXT:    addi a0, sp, 19
-; ILP32E-NEXT:    andi a1, a0, -8
+; ILP32E-NEXT:    andi a1, a1, -8
 ; ILP32E-NEXT:    addi a0, a1, 8
 ; ILP32E-NEXT:    sw a0, 0(sp)
 ; ILP32E-NEXT:    lw a0, 0(a1)
@@ -116,9 +116,9 @@ define void @va_double(i32 %n, ...) {
 ; ILP32E-WITHFP-NEXT:    sw a3, 12(s0)
 ; ILP32E-WITHFP-NEXT:    sw a4, 16(s0)
 ; ILP32E-WITHFP-NEXT:    addi a0, s0, 4
+; ILP32E-WITHFP-NEXT:    addi a1, s0, 11
 ; ILP32E-WITHFP-NEXT:    sw a0, -12(s0)
-; ILP32E-WITHFP-NEXT:    addi a0, s0, 11
-; ILP32E-WITHFP-NEXT:    andi a1, a0, -8
+; ILP32E-WITHFP-NEXT:    andi a1, a1, -8
 ; ILP32E-WITHFP-NEXT:    addi a0, a1, 8
 ; ILP32E-WITHFP-NEXT:    sw a0, -12(s0)
 ; ILP32E-WITHFP-NEXT:    lw a0, 0(a1)

diff  --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll
index 318b6973b724e6..895d84b38be321 100644
--- a/llvm/test/CodeGen/RISCV/vararg.ll
+++ b/llvm/test/CodeGen/RISCV/vararg.ll
@@ -841,11 +841,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a3, 28(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a4, 32(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 20
+; ILP32-ILP32F-FPELIM-NEXT:    addi a1, sp, 27
+; ILP32-ILP32F-FPELIM-NEXT:    addi a2, sp, 35
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 12(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 27
-; ILP32-ILP32F-FPELIM-NEXT:    andi a1, a0, -8
-; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 35
-; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 12(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    andi a1, a1, -8
+; ILP32-ILP32F-FPELIM-NEXT:    sw a2, 12(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a1)
 ; ILP32-ILP32F-FPELIM-NEXT:    lw a1, 4(a1)
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 48
@@ -865,11 +865,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a3, 12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a4, 16(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 4
+; ILP32-ILP32F-WITHFP-NEXT:    addi a1, s0, 11
+; ILP32-ILP32F-WITHFP-NEXT:    addi a2, s0, 19
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a0, -12(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 11
-; ILP32-ILP32F-WITHFP-NEXT:    andi a1, a0, -8
-; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 19
-; ILP32-ILP32F-WITHFP-NEXT:    sw a0, -12(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    andi a1, a1, -8
+; ILP32-ILP32F-WITHFP-NEXT:    sw a2, -12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a1)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw a1, 4(a1)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -888,11 +888,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 28(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a4, 32(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 20
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a1, sp, 27
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a2, sp, 35
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 27
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a1, a0, -8
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 35
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a0, 12(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a1, a1, -8
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a2, 12(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a1)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a1, 4(a1)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 48
@@ -907,11 +907,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32E-FPELIM-NEXT:    sw a3, 16(sp)
 ; ILP32E-FPELIM-NEXT:    sw a4, 20(sp)
 ; ILP32E-FPELIM-NEXT:    addi a0, sp, 8
+; ILP32E-FPELIM-NEXT:    addi a1, sp, 15
+; ILP32E-FPELIM-NEXT:    addi a2, sp, 23
 ; ILP32E-FPELIM-NEXT:    sw a0, 0(sp)
-; ILP32E-FPELIM-NEXT:    addi a0, sp, 15
-; ILP32E-FPELIM-NEXT:    andi a1, a0, -8
-; ILP32E-FPELIM-NEXT:    addi a0, sp, 23
-; ILP32E-FPELIM-NEXT:    sw a0, 0(sp)
+; ILP32E-FPELIM-NEXT:    andi a1, a1, -8
+; ILP32E-FPELIM-NEXT:    sw a2, 0(sp)
 ; ILP32E-FPELIM-NEXT:    lw a0, 0(a1)
 ; ILP32E-FPELIM-NEXT:    lw a1, 4(a1)
 ; ILP32E-FPELIM-NEXT:    addi sp, sp, 28
@@ -929,11 +929,11 @@ define i64 @va2(ptr %fmt, ...) nounwind {
 ; ILP32E-WITHFP-NEXT:    sw a3, 12(s0)
 ; ILP32E-WITHFP-NEXT:    sw a4, 16(s0)
 ; ILP32E-WITHFP-NEXT:    addi a0, s0, 4
+; ILP32E-WITHFP-NEXT:    addi a1, s0, 11
+; ILP32E-WITHFP-NEXT:    addi a2, s0, 19
 ; ILP32E-WITHFP-NEXT:    sw a0, -12(s0)
-; ILP32E-WITHFP-NEXT:    addi a0, s0, 11
-; ILP32E-WITHFP-NEXT:    andi a1, a0, -8
-; ILP32E-WITHFP-NEXT:    addi a0, s0, 19
-; ILP32E-WITHFP-NEXT:    sw a0, -12(s0)
+; ILP32E-WITHFP-NEXT:    andi a1, a1, -8
+; ILP32E-WITHFP-NEXT:    sw a2, -12(s0)
 ; ILP32E-WITHFP-NEXT:    lw a0, 0(a1)
 ; ILP32E-WITHFP-NEXT:    lw a1, 4(a1)
 ; ILP32E-WITHFP-NEXT:    lw ra, 8(sp) # 4-byte Folded Reload
@@ -1040,9 +1040,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 27
 ; ILP32-ILP32F-FPELIM-NEXT:    andi a1, a0, -8
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, a1, 4
+; ILP32-ILP32F-FPELIM-NEXT:    addi a2, a1, 8
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 12(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a1)
-; ILP32-ILP32F-FPELIM-NEXT:    addi a2, a1, 8
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a2, 12(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    lw a1, 4(a1)
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 48
@@ -1064,9 +1064,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 11
 ; ILP32-ILP32F-WITHFP-NEXT:    andi a1, a0, -8
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, a1, 4
+; ILP32-ILP32F-WITHFP-NEXT:    addi a2, a1, 8
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a0, -12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a1)
-; ILP32-ILP32F-WITHFP-NEXT:    addi a2, a1, 8
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a2, -12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw a1, 4(a1)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -1106,9 +1106,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
 ; ILP32E-FPELIM-NEXT:    addi a0, sp, 15
 ; ILP32E-FPELIM-NEXT:    andi a1, a0, -8
 ; ILP32E-FPELIM-NEXT:    addi a0, a1, 4
+; ILP32E-FPELIM-NEXT:    addi a2, a1, 8
 ; ILP32E-FPELIM-NEXT:    sw a0, 0(sp)
 ; ILP32E-FPELIM-NEXT:    lw a0, 0(a1)
-; ILP32E-FPELIM-NEXT:    addi a2, a1, 8
 ; ILP32E-FPELIM-NEXT:    sw a2, 0(sp)
 ; ILP32E-FPELIM-NEXT:    lw a1, 4(a1)
 ; ILP32E-FPELIM-NEXT:    addi sp, sp, 28
@@ -1128,9 +1128,9 @@ define i64 @va2_va_arg(ptr %fmt, ...) nounwind {
 ; ILP32E-WITHFP-NEXT:    addi a0, s0, 11
 ; ILP32E-WITHFP-NEXT:    andi a1, a0, -8
 ; ILP32E-WITHFP-NEXT:    addi a0, a1, 4
+; ILP32E-WITHFP-NEXT:    addi a2, a1, 8
 ; ILP32E-WITHFP-NEXT:    sw a0, -12(s0)
 ; ILP32E-WITHFP-NEXT:    lw a0, 0(a1)
-; ILP32E-WITHFP-NEXT:    addi a2, a1, 8
 ; ILP32E-WITHFP-NEXT:    sw a2, -12(s0)
 ; ILP32E-WITHFP-NEXT:    lw a1, 4(a1)
 ; ILP32E-WITHFP-NEXT:    lw ra, 8(sp) # 4-byte Folded Reload
@@ -1343,15 +1343,15 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a5, 20(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a6, 24(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 12
+; ILP32-ILP32F-FPELIM-NEXT:    addi a3, sp, 19
+; ILP32-ILP32F-FPELIM-NEXT:    addi a4, sp, 27
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 4(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 19
-; ILP32-ILP32F-FPELIM-NEXT:    andi a0, a0, -8
-; ILP32-ILP32F-FPELIM-NEXT:    addi a3, sp, 27
-; ILP32-ILP32F-FPELIM-NEXT:    sw a3, 4(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a3, 4(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    add a2, a2, a3
-; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a0
+; ILP32-ILP32F-FPELIM-NEXT:    andi a3, a3, -8
+; ILP32-ILP32F-FPELIM-NEXT:    sw a4, 4(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 4(a3)
+; ILP32-ILP32F-FPELIM-NEXT:    lw a3, 0(a3)
+; ILP32-ILP32F-FPELIM-NEXT:    add a2, a2, a0
+; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a3
 ; ILP32-ILP32F-FPELIM-NEXT:    sltu a1, a0, a1
 ; ILP32-ILP32F-FPELIM-NEXT:    add a1, a2, a1
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 32
@@ -1369,15 +1369,15 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a5, 12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a6, 16(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 4
+; ILP32-ILP32F-WITHFP-NEXT:    addi a3, s0, 11
+; ILP32-ILP32F-WITHFP-NEXT:    addi a4, s0, 19
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a0, -12(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 11
-; ILP32-ILP32F-WITHFP-NEXT:    andi a0, a0, -8
-; ILP32-ILP32F-WITHFP-NEXT:    addi a3, s0, 19
-; ILP32-ILP32F-WITHFP-NEXT:    sw a3, -12(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a3, 4(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    add a2, a2, a3
-; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a0
+; ILP32-ILP32F-WITHFP-NEXT:    andi a3, a3, -8
+; ILP32-ILP32F-WITHFP-NEXT:    sw a4, -12(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 4(a3)
+; ILP32-ILP32F-WITHFP-NEXT:    lw a3, 0(a3)
+; ILP32-ILP32F-WITHFP-NEXT:    add a2, a2, a0
+; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a3
 ; ILP32-ILP32F-WITHFP-NEXT:    sltu a1, a0, a1
 ; ILP32-ILP32F-WITHFP-NEXT:    add a1, a2, a1
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 20(sp) # 4-byte Folded Reload
@@ -1394,15 +1394,15 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a5, 20(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a6, 24(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 12
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a3, sp, 19
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a4, sp, 27
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a0, 4(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, sp, 19
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a0, a0, -8
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a3, sp, 27
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 4(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 4(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a2, a2, a3
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a3, a3, -8
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a4, 4(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 4(a3)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a3, 0(a3)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a2, a2, a0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a3
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sltu a1, a0, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a2, a1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 32
@@ -1415,15 +1415,15 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-NEXT:    sw a5, 16(sp)
 ; ILP32E-FPELIM-NEXT:    addi a0, sp, 8
+; ILP32E-FPELIM-NEXT:    addi a3, sp, 15
+; ILP32E-FPELIM-NEXT:    addi a4, sp, 23
 ; ILP32E-FPELIM-NEXT:    sw a0, 0(sp)
-; ILP32E-FPELIM-NEXT:    addi a0, sp, 15
-; ILP32E-FPELIM-NEXT:    andi a0, a0, -8
-; ILP32E-FPELIM-NEXT:    addi a3, sp, 23
-; ILP32E-FPELIM-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-NEXT:    lw a3, 4(a0)
-; ILP32E-FPELIM-NEXT:    lw a0, 0(a0)
-; ILP32E-FPELIM-NEXT:    add a2, a2, a3
-; ILP32E-FPELIM-NEXT:    add a0, a1, a0
+; ILP32E-FPELIM-NEXT:    andi a3, a3, -8
+; ILP32E-FPELIM-NEXT:    sw a4, 0(sp)
+; ILP32E-FPELIM-NEXT:    lw a0, 4(a3)
+; ILP32E-FPELIM-NEXT:    lw a3, 0(a3)
+; ILP32E-FPELIM-NEXT:    add a2, a2, a0
+; ILP32E-FPELIM-NEXT:    add a0, a1, a3
 ; ILP32E-FPELIM-NEXT:    sltu a1, a0, a1
 ; ILP32E-FPELIM-NEXT:    add a1, a2, a1
 ; ILP32E-FPELIM-NEXT:    addi sp, sp, 20
@@ -1439,15 +1439,15 @@ define i64 @va3(i32 %a, i64 %b, ...) nounwind {
 ; ILP32E-WITHFP-NEXT:    sw a4, 8(s0)
 ; ILP32E-WITHFP-NEXT:    sw a5, 12(s0)
 ; ILP32E-WITHFP-NEXT:    addi a0, s0, 4
+; ILP32E-WITHFP-NEXT:    addi a3, s0, 11
+; ILP32E-WITHFP-NEXT:    addi a4, s0, 19
 ; ILP32E-WITHFP-NEXT:    sw a0, -12(s0)
-; ILP32E-WITHFP-NEXT:    addi a0, s0, 11
-; ILP32E-WITHFP-NEXT:    andi a0, a0, -8
-; ILP32E-WITHFP-NEXT:    addi a3, s0, 19
-; ILP32E-WITHFP-NEXT:    sw a3, -12(s0)
-; ILP32E-WITHFP-NEXT:    lw a3, 4(a0)
-; ILP32E-WITHFP-NEXT:    lw a0, 0(a0)
-; ILP32E-WITHFP-NEXT:    add a2, a2, a3
-; ILP32E-WITHFP-NEXT:    add a0, a1, a0
+; ILP32E-WITHFP-NEXT:    andi a3, a3, -8
+; ILP32E-WITHFP-NEXT:    sw a4, -12(s0)
+; ILP32E-WITHFP-NEXT:    lw a0, 4(a3)
+; ILP32E-WITHFP-NEXT:    lw a3, 0(a3)
+; ILP32E-WITHFP-NEXT:    add a2, a2, a0
+; ILP32E-WITHFP-NEXT:    add a0, a1, a3
 ; ILP32E-WITHFP-NEXT:    sltu a1, a0, a1
 ; ILP32E-WITHFP-NEXT:    add a1, a2, a1
 ; ILP32E-WITHFP-NEXT:    lw ra, 8(sp) # 4-byte Folded Reload
@@ -1549,9 +1549,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, sp, 19
 ; ILP32-ILP32F-FPELIM-NEXT:    andi a0, a0, -8
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a3, a0, 4
+; ILP32-ILP32F-FPELIM-NEXT:    addi a4, a0, 8
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a3, 4(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    lw a3, 0(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    addi a4, a0, 8
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a4, 4(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    lw a4, 4(a0)
 ; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a3
@@ -1575,9 +1575,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, s0, 11
 ; ILP32-ILP32F-WITHFP-NEXT:    andi a0, a0, -8
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a3, a0, 4
+; ILP32-ILP32F-WITHFP-NEXT:    addi a4, a0, 8
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a3, -12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw a3, 0(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    addi a4, a0, 8
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a4, -12(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    lw a4, 4(a0)
 ; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a3
@@ -1621,9 +1621,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; ILP32E-FPELIM-NEXT:    addi a0, sp, 15
 ; ILP32E-FPELIM-NEXT:    andi a0, a0, -8
 ; ILP32E-FPELIM-NEXT:    addi a3, a0, 4
+; ILP32E-FPELIM-NEXT:    addi a4, a0, 8
 ; ILP32E-FPELIM-NEXT:    sw a3, 0(sp)
 ; ILP32E-FPELIM-NEXT:    lw a3, 0(a0)
-; ILP32E-FPELIM-NEXT:    addi a4, a0, 8
 ; ILP32E-FPELIM-NEXT:    sw a4, 0(sp)
 ; ILP32E-FPELIM-NEXT:    lw a4, 4(a0)
 ; ILP32E-FPELIM-NEXT:    add a0, a1, a3
@@ -1645,9 +1645,9 @@ define i64 @va3_va_arg(i32 %a, i64 %b, ...) nounwind {
 ; ILP32E-WITHFP-NEXT:    addi a0, s0, 11
 ; ILP32E-WITHFP-NEXT:    andi a0, a0, -8
 ; ILP32E-WITHFP-NEXT:    addi a3, a0, 4
+; ILP32E-WITHFP-NEXT:    addi a4, a0, 8
 ; ILP32E-WITHFP-NEXT:    sw a3, -12(s0)
 ; ILP32E-WITHFP-NEXT:    lw a3, 0(a0)
-; ILP32E-WITHFP-NEXT:    addi a4, a0, 8
 ; ILP32E-WITHFP-NEXT:    sw a4, -12(s0)
 ; ILP32E-WITHFP-NEXT:    lw a4, 4(a0)
 ; ILP32E-WITHFP-NEXT:    add a0, a1, a3
@@ -1815,8 +1815,8 @@ define void @va3_caller() nounwind {
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi sp, sp, -16
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a2, 1
-; LP64-LP64F-LP64D-FPELIM-NEXT:    slli a2, a2, 62
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a0, 2
+; LP64-LP64F-LP64D-FPELIM-NEXT:    slli a2, a2, 62
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a1, 1111
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    call va3
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1830,8 +1830,8 @@ define void @va3_caller() nounwind {
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addi s0, sp, 16
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a2, 1
-; LP64-LP64F-LP64D-WITHFP-NEXT:    slli a2, a2, 62
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a0, 2
+; LP64-LP64F-LP64D-WITHFP-NEXT:    slli a2, a2, 62
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a1, 1111
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    call va3
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1844,8 +1844,8 @@ define void @va3_caller() nounwind {
 ; LP64E-FPELIM-NEXT:    addi sp, sp, -8
 ; LP64E-FPELIM-NEXT:    sd ra, 0(sp) # 8-byte Folded Spill
 ; LP64E-FPELIM-NEXT:    li a2, 1
-; LP64E-FPELIM-NEXT:    slli a2, a2, 62
 ; LP64E-FPELIM-NEXT:    li a0, 2
+; LP64E-FPELIM-NEXT:    slli a2, a2, 62
 ; LP64E-FPELIM-NEXT:    li a1, 1111
 ; LP64E-FPELIM-NEXT:    call va3
 ; LP64E-FPELIM-NEXT:    ld ra, 0(sp) # 8-byte Folded Reload
@@ -1859,8 +1859,8 @@ define void @va3_caller() nounwind {
 ; LP64E-WITHFP-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
 ; LP64E-WITHFP-NEXT:    addi s0, sp, 16
 ; LP64E-WITHFP-NEXT:    li a2, 1
-; LP64E-WITHFP-NEXT:    slli a2, a2, 62
 ; LP64E-WITHFP-NEXT:    li a0, 2
+; LP64E-WITHFP-NEXT:    slli a2, a2, 62
 ; LP64E-WITHFP-NEXT:    li a1, 1111
 ; LP64E-WITHFP-NEXT:    call va3
 ; LP64E-WITHFP-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -1895,21 +1895,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a0, a0, 3
 ; ILP32-ILP32F-FPELIM-NEXT:    andi a0, a0, -4
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a1, a0, 4
+; ILP32-ILP32F-FPELIM-NEXT:    addi a2, a0, 7
 ; ILP32-ILP32F-FPELIM-NEXT:    sw a1, 4(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a1, 0(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    addi a0, a0, 7
-; ILP32-ILP32F-FPELIM-NEXT:    andi a0, a0, -4
-; ILP32-ILP32F-FPELIM-NEXT:    addi a2, a0, 4
-; ILP32-ILP32F-FPELIM-NEXT:    sw a2, 4(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    lw a2, 0(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    addi a0, a0, 7
-; ILP32-ILP32F-FPELIM-NEXT:    andi a0, a0, -4
-; ILP32-ILP32F-FPELIM-NEXT:    addi a3, a0, 4
-; ILP32-ILP32F-FPELIM-NEXT:    sw a3, 4(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    andi a2, a2, -4
 ; ILP32-ILP32F-FPELIM-NEXT:    lw a0, 0(a0)
-; ILP32-ILP32F-FPELIM-NEXT:    add a1, a1, s0
-; ILP32-ILP32F-FPELIM-NEXT:    add a1, a1, a2
-; ILP32-ILP32F-FPELIM-NEXT:    add a0, a1, a0
+; ILP32-ILP32F-FPELIM-NEXT:    addi a1, a2, 4
+; ILP32-ILP32F-FPELIM-NEXT:    addi a3, a2, 7
+; ILP32-ILP32F-FPELIM-NEXT:    sw a1, 4(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    andi a3, a3, -4
+; ILP32-ILP32F-FPELIM-NEXT:    lw a1, 0(a2)
+; ILP32-ILP32F-FPELIM-NEXT:    addi a2, a3, 4
+; ILP32-ILP32F-FPELIM-NEXT:    sw a2, 4(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    lw a2, 0(a3)
+; ILP32-ILP32F-FPELIM-NEXT:    add a0, a0, s0
+; ILP32-ILP32F-FPELIM-NEXT:    add a0, a0, a1
+; ILP32-ILP32F-FPELIM-NEXT:    add a0, a0, a2
 ; ILP32-ILP32F-FPELIM-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-FPELIM-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, 48
@@ -1938,21 +1938,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a0, a0, 3
 ; ILP32-ILP32F-WITHFP-NEXT:    andi a0, a0, -4
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a1, a0, 4
+; ILP32-ILP32F-WITHFP-NEXT:    addi a2, a0, 7
 ; ILP32-ILP32F-WITHFP-NEXT:    sw a1, -16(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a1, 0(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    addi a0, a0, 7
-; ILP32-ILP32F-WITHFP-NEXT:    andi a0, a0, -4
-; ILP32-ILP32F-WITHFP-NEXT:    addi a2, a0, 4
-; ILP32-ILP32F-WITHFP-NEXT:    sw a2, -16(s0)
-; ILP32-ILP32F-WITHFP-NEXT:    lw a2, 0(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    addi a0, a0, 7
-; ILP32-ILP32F-WITHFP-NEXT:    andi a0, a0, -4
-; ILP32-ILP32F-WITHFP-NEXT:    addi a3, a0, 4
-; ILP32-ILP32F-WITHFP-NEXT:    sw a3, -16(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    andi a2, a2, -4
 ; ILP32-ILP32F-WITHFP-NEXT:    lw a0, 0(a0)
-; ILP32-ILP32F-WITHFP-NEXT:    add a1, a1, s1
-; ILP32-ILP32F-WITHFP-NEXT:    add a1, a1, a2
-; ILP32-ILP32F-WITHFP-NEXT:    add a0, a1, a0
+; ILP32-ILP32F-WITHFP-NEXT:    addi a1, a2, 4
+; ILP32-ILP32F-WITHFP-NEXT:    addi a3, a2, 7
+; ILP32-ILP32F-WITHFP-NEXT:    sw a1, -16(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    andi a3, a3, -4
+; ILP32-ILP32F-WITHFP-NEXT:    lw a1, 0(a2)
+; ILP32-ILP32F-WITHFP-NEXT:    addi a2, a3, 4
+; ILP32-ILP32F-WITHFP-NEXT:    sw a2, -16(s0)
+; ILP32-ILP32F-WITHFP-NEXT:    lw a2, 0(a3)
+; ILP32-ILP32F-WITHFP-NEXT:    add a0, a0, s1
+; ILP32-ILP32F-WITHFP-NEXT:    add a0, a0, a1
+; ILP32-ILP32F-WITHFP-NEXT:    add a0, a0, a2
 ; ILP32-ILP32F-WITHFP-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; ILP32-ILP32F-WITHFP-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1980,21 +1980,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, a0, 3
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a0, a0, -4
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a1, a0, 4
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a2, a0, 7
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a1, 4(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a1, 0(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, a0, 7
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a0, a0, -4
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a2, a0, 4
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a2, 4(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a2, 0(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, a0, 7
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a0, a0, -4
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a3, a0, 4
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 4(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a2, a2, -4
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a0, 0(a0)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a1, s0
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a1, a1, a2
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a1, a0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a1, a2, 4
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a3, a2, 7
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a1, 4(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    andi a3, a3, -4
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a1, 0(a2)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a2, a3, 4
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a2, 4(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw a2, 0(a3)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a0, s0
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a0, a1
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    add a0, a0, a2
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, 48
@@ -2019,21 +2019,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; ILP32E-FPELIM-NEXT:    addi a0, a0, 3
 ; ILP32E-FPELIM-NEXT:    andi a0, a0, -4
 ; ILP32E-FPELIM-NEXT:    addi a1, a0, 4
+; ILP32E-FPELIM-NEXT:    addi a2, a0, 7
 ; ILP32E-FPELIM-NEXT:    sw a1, 4(sp)
-; ILP32E-FPELIM-NEXT:    lw a1, 0(a0)
-; ILP32E-FPELIM-NEXT:    addi a0, a0, 7
-; ILP32E-FPELIM-NEXT:    andi a0, a0, -4
-; ILP32E-FPELIM-NEXT:    addi a2, a0, 4
-; ILP32E-FPELIM-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-NEXT:    lw a2, 0(a0)
-; ILP32E-FPELIM-NEXT:    addi a0, a0, 7
-; ILP32E-FPELIM-NEXT:    andi a0, a0, -4
-; ILP32E-FPELIM-NEXT:    addi a3, a0, 4
-; ILP32E-FPELIM-NEXT:    sw a3, 4(sp)
+; ILP32E-FPELIM-NEXT:    andi a2, a2, -4
 ; ILP32E-FPELIM-NEXT:    lw a0, 0(a0)
-; ILP32E-FPELIM-NEXT:    add a1, a1, s0
-; ILP32E-FPELIM-NEXT:    add a1, a1, a2
-; ILP32E-FPELIM-NEXT:    add a0, a1, a0
+; ILP32E-FPELIM-NEXT:    addi a1, a2, 4
+; ILP32E-FPELIM-NEXT:    addi a3, a2, 7
+; ILP32E-FPELIM-NEXT:    sw a1, 4(sp)
+; ILP32E-FPELIM-NEXT:    andi a3, a3, -4
+; ILP32E-FPELIM-NEXT:    lw a1, 0(a2)
+; ILP32E-FPELIM-NEXT:    addi a2, a3, 4
+; ILP32E-FPELIM-NEXT:    sw a2, 4(sp)
+; ILP32E-FPELIM-NEXT:    lw a2, 0(a3)
+; ILP32E-FPELIM-NEXT:    add a0, a0, s0
+; ILP32E-FPELIM-NEXT:    add a0, a0, a1
+; ILP32E-FPELIM-NEXT:    add a0, a0, a2
 ; ILP32E-FPELIM-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
 ; ILP32E-FPELIM-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
 ; ILP32E-FPELIM-NEXT:    addi sp, sp, 40
@@ -2060,21 +2060,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; ILP32E-WITHFP-NEXT:    addi a0, a0, 3
 ; ILP32E-WITHFP-NEXT:    andi a0, a0, -4
 ; ILP32E-WITHFP-NEXT:    addi a1, a0, 4
+; ILP32E-WITHFP-NEXT:    addi a2, a0, 7
 ; ILP32E-WITHFP-NEXT:    sw a1, -16(s0)
-; ILP32E-WITHFP-NEXT:    lw a1, 0(a0)
-; ILP32E-WITHFP-NEXT:    addi a0, a0, 7
-; ILP32E-WITHFP-NEXT:    andi a0, a0, -4
-; ILP32E-WITHFP-NEXT:    addi a2, a0, 4
-; ILP32E-WITHFP-NEXT:    sw a2, -16(s0)
-; ILP32E-WITHFP-NEXT:    lw a2, 0(a0)
-; ILP32E-WITHFP-NEXT:    addi a0, a0, 7
-; ILP32E-WITHFP-NEXT:    andi a0, a0, -4
-; ILP32E-WITHFP-NEXT:    addi a3, a0, 4
-; ILP32E-WITHFP-NEXT:    sw a3, -16(s0)
+; ILP32E-WITHFP-NEXT:    andi a2, a2, -4
 ; ILP32E-WITHFP-NEXT:    lw a0, 0(a0)
-; ILP32E-WITHFP-NEXT:    add a1, a1, s1
-; ILP32E-WITHFP-NEXT:    add a1, a1, a2
-; ILP32E-WITHFP-NEXT:    add a0, a1, a0
+; ILP32E-WITHFP-NEXT:    addi a1, a2, 4
+; ILP32E-WITHFP-NEXT:    addi a3, a2, 7
+; ILP32E-WITHFP-NEXT:    sw a1, -16(s0)
+; ILP32E-WITHFP-NEXT:    andi a3, a3, -4
+; ILP32E-WITHFP-NEXT:    lw a1, 0(a2)
+; ILP32E-WITHFP-NEXT:    addi a2, a3, 4
+; ILP32E-WITHFP-NEXT:    sw a2, -16(s0)
+; ILP32E-WITHFP-NEXT:    lw a2, 0(a3)
+; ILP32E-WITHFP-NEXT:    add a0, a0, s1
+; ILP32E-WITHFP-NEXT:    add a0, a0, a1
+; ILP32E-WITHFP-NEXT:    add a0, a0, a2
 ; ILP32E-WITHFP-NEXT:    lw ra, 16(sp) # 4-byte Folded Reload
 ; ILP32E-WITHFP-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; ILP32E-WITHFP-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
@@ -2102,21 +2102,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a0, a0, 3
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    andi a0, a0, -4
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a1, a0, 8
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a2, a0, 11
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a1, 8(sp)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a1, 0(a0)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a0, a0, 11
-; LP64-LP64F-LP64D-FPELIM-NEXT:    andi a0, a0, -4
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a2, a0, 8
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a2, 8(sp)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a2, 0(a0)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a0, a0, 11
-; LP64-LP64F-LP64D-FPELIM-NEXT:    andi a0, a0, -4
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a3, a0, 8
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a3, 8(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    andi a2, a2, -4
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a0, 0(a0)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, a1, s0
-; LP64-LP64F-LP64D-FPELIM-NEXT:    add a1, a1, a2
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addw a0, a1, a0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a1, a2, 8
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a3, a2, 11
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a1, 8(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    andi a3, a3, -4
+; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a1, 0(a2)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addi a2, a3, 8
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd a2, 8(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a2, 0(a3)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, a0, s0
+; LP64-LP64F-LP64D-FPELIM-NEXT:    add a0, a0, a1
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addw a0, a0, a2
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    addi sp, sp, 96
@@ -2145,21 +2145,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a0, a0, 3
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    andi a0, a0, -4
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a1, a0, 8
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a2, a0, 11
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a1, -32(s0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a1, 0(a0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a0, a0, 11
-; LP64-LP64F-LP64D-WITHFP-NEXT:    andi a0, a0, -4
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a2, a0, 8
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a2, -32(s0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a2, 0(a0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a0, a0, 11
-; LP64-LP64F-LP64D-WITHFP-NEXT:    andi a0, a0, -4
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a3, a0, 8
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a3, -32(s0)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    andi a2, a2, -4
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a0, 0(a0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    add a1, a1, s1
-; LP64-LP64F-LP64D-WITHFP-NEXT:    add a1, a1, a2
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addw a0, a1, a0
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a1, a2, 8
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a3, a2, 11
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a1, -32(s0)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    andi a3, a3, -4
+; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a1, 0(a2)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addi a2, a3, 8
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd a2, -32(s0)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a2, 0(a3)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    add a0, a0, s1
+; LP64-LP64F-LP64D-WITHFP-NEXT:    add a0, a0, a1
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addw a0, a0, a2
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -2185,21 +2185,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; LP64E-FPELIM-NEXT:    addi a0, a0, 3
 ; LP64E-FPELIM-NEXT:    andi a0, a0, -4
 ; LP64E-FPELIM-NEXT:    addi a1, a0, 8
+; LP64E-FPELIM-NEXT:    addi a2, a0, 11
 ; LP64E-FPELIM-NEXT:    sd a1, 8(sp)
-; LP64E-FPELIM-NEXT:    ld a1, 0(a0)
-; LP64E-FPELIM-NEXT:    addi a0, a0, 11
-; LP64E-FPELIM-NEXT:    andi a0, a0, -4
-; LP64E-FPELIM-NEXT:    addi a2, a0, 8
-; LP64E-FPELIM-NEXT:    sd a2, 8(sp)
-; LP64E-FPELIM-NEXT:    ld a2, 0(a0)
-; LP64E-FPELIM-NEXT:    addi a0, a0, 11
-; LP64E-FPELIM-NEXT:    andi a0, a0, -4
-; LP64E-FPELIM-NEXT:    addi a3, a0, 8
-; LP64E-FPELIM-NEXT:    sd a3, 8(sp)
+; LP64E-FPELIM-NEXT:    andi a2, a2, -4
 ; LP64E-FPELIM-NEXT:    ld a0, 0(a0)
-; LP64E-FPELIM-NEXT:    add a1, a1, s0
-; LP64E-FPELIM-NEXT:    add a1, a1, a2
-; LP64E-FPELIM-NEXT:    addw a0, a1, a0
+; LP64E-FPELIM-NEXT:    addi a1, a2, 8
+; LP64E-FPELIM-NEXT:    addi a3, a2, 11
+; LP64E-FPELIM-NEXT:    sd a1, 8(sp)
+; LP64E-FPELIM-NEXT:    andi a3, a3, -4
+; LP64E-FPELIM-NEXT:    ld a1, 0(a2)
+; LP64E-FPELIM-NEXT:    addi a2, a3, 8
+; LP64E-FPELIM-NEXT:    sd a2, 8(sp)
+; LP64E-FPELIM-NEXT:    ld a2, 0(a3)
+; LP64E-FPELIM-NEXT:    add a0, a0, s0
+; LP64E-FPELIM-NEXT:    add a0, a0, a1
+; LP64E-FPELIM-NEXT:    addw a0, a0, a2
 ; LP64E-FPELIM-NEXT:    ld ra, 24(sp) # 8-byte Folded Reload
 ; LP64E-FPELIM-NEXT:    ld s0, 16(sp) # 8-byte Folded Reload
 ; LP64E-FPELIM-NEXT:    addi sp, sp, 80
@@ -2226,21 +2226,21 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; LP64E-WITHFP-NEXT:    addi a0, a0, 3
 ; LP64E-WITHFP-NEXT:    andi a0, a0, -4
 ; LP64E-WITHFP-NEXT:    addi a1, a0, 8
+; LP64E-WITHFP-NEXT:    addi a2, a0, 11
 ; LP64E-WITHFP-NEXT:    sd a1, -32(s0)
-; LP64E-WITHFP-NEXT:    ld a1, 0(a0)
-; LP64E-WITHFP-NEXT:    addi a0, a0, 11
-; LP64E-WITHFP-NEXT:    andi a0, a0, -4
-; LP64E-WITHFP-NEXT:    addi a2, a0, 8
-; LP64E-WITHFP-NEXT:    sd a2, -32(s0)
-; LP64E-WITHFP-NEXT:    ld a2, 0(a0)
-; LP64E-WITHFP-NEXT:    addi a0, a0, 11
-; LP64E-WITHFP-NEXT:    andi a0, a0, -4
-; LP64E-WITHFP-NEXT:    addi a3, a0, 8
-; LP64E-WITHFP-NEXT:    sd a3, -32(s0)
+; LP64E-WITHFP-NEXT:    andi a2, a2, -4
 ; LP64E-WITHFP-NEXT:    ld a0, 0(a0)
-; LP64E-WITHFP-NEXT:    add a1, a1, s1
-; LP64E-WITHFP-NEXT:    add a1, a1, a2
-; LP64E-WITHFP-NEXT:    addw a0, a1, a0
+; LP64E-WITHFP-NEXT:    addi a1, a2, 8
+; LP64E-WITHFP-NEXT:    addi a3, a2, 11
+; LP64E-WITHFP-NEXT:    sd a1, -32(s0)
+; LP64E-WITHFP-NEXT:    andi a3, a3, -4
+; LP64E-WITHFP-NEXT:    ld a1, 0(a2)
+; LP64E-WITHFP-NEXT:    addi a2, a3, 8
+; LP64E-WITHFP-NEXT:    sd a2, -32(s0)
+; LP64E-WITHFP-NEXT:    ld a2, 0(a3)
+; LP64E-WITHFP-NEXT:    add a0, a0, s1
+; LP64E-WITHFP-NEXT:    add a0, a0, a1
+; LP64E-WITHFP-NEXT:    addw a0, a0, a2
 ; LP64E-WITHFP-NEXT:    ld ra, 32(sp) # 8-byte Folded Reload
 ; LP64E-WITHFP-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
 ; LP64E-WITHFP-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
@@ -2275,36 +2275,36 @@ define void @va5_aligned_stack_caller() nounwind {
 ; ILP32-ILP32F-FPELIM:       # %bb.0:
 ; ILP32-ILP32F-FPELIM-NEXT:    addi sp, sp, -64
 ; ILP32-ILP32F-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; ILP32-ILP32F-FPELIM-NEXT:    li a0, 17
-; ILP32-ILP32F-FPELIM-NEXT:    li a1, 16
-; ILP32-ILP32F-FPELIM-NEXT:    sw a1, 20(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 24(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    li a0, 15
-; ILP32-ILP32F-FPELIM-NEXT:    lui a1, 262236
-; ILP32-ILP32F-FPELIM-NEXT:    addi a1, a1, 655
-; ILP32-ILP32F-FPELIM-NEXT:    lui a2, 377487
-; ILP32-ILP32F-FPELIM-NEXT:    addi a2, a2, 1475
-; ILP32-ILP32F-FPELIM-NEXT:    li a3, 14
-; ILP32-ILP32F-FPELIM-NEXT:    sw a3, 0(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw a2, 8(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw a1, 12(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    sw a0, 16(sp)
-; ILP32-ILP32F-FPELIM-NEXT:    lui a0, 262153
-; ILP32-ILP32F-FPELIM-NEXT:    addi a5, a0, 491
-; ILP32-ILP32F-FPELIM-NEXT:    lui a0, 545260
-; ILP32-ILP32F-FPELIM-NEXT:    addi t0, a0, -1967
-; ILP32-ILP32F-FPELIM-NEXT:    lui a0, 964690
-; ILP32-ILP32F-FPELIM-NEXT:    addi t1, a0, -328
-; ILP32-ILP32F-FPELIM-NEXT:    lui a0, 335544
-; ILP32-ILP32F-FPELIM-NEXT:    addi t2, a0, 1311
-; ILP32-ILP32F-FPELIM-NEXT:    lui a0, 688509
-; ILP32-ILP32F-FPELIM-NEXT:    addi a6, a0, -2048
+; ILP32-ILP32F-FPELIM-NEXT:    li a4, 17
+; ILP32-ILP32F-FPELIM-NEXT:    li a5, 16
+; ILP32-ILP32F-FPELIM-NEXT:    li a6, 15
+; ILP32-ILP32F-FPELIM-NEXT:    lui a7, 262236
+; ILP32-ILP32F-FPELIM-NEXT:    lui t0, 377487
+; ILP32-ILP32F-FPELIM-NEXT:    li t1, 14
+; ILP32-ILP32F-FPELIM-NEXT:    lui t2, 262153
+; ILP32-ILP32F-FPELIM-NEXT:    lui t3, 545260
+; ILP32-ILP32F-FPELIM-NEXT:    lui t4, 964690
+; ILP32-ILP32F-FPELIM-NEXT:    lui t5, 335544
+; ILP32-ILP32F-FPELIM-NEXT:    lui t6, 688509
 ; ILP32-ILP32F-FPELIM-NEXT:    li a0, 1
 ; ILP32-ILP32F-FPELIM-NEXT:    li a1, 11
 ; ILP32-ILP32F-FPELIM-NEXT:    addi a2, sp, 32
 ; ILP32-ILP32F-FPELIM-NEXT:    li a3, 12
+; ILP32-ILP32F-FPELIM-NEXT:    sw a5, 20(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw a4, 24(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    li a4, 13
+; ILP32-ILP32F-FPELIM-NEXT:    addi a5, a7, 655
+; ILP32-ILP32F-FPELIM-NEXT:    addi a7, t0, 1475
+; ILP32-ILP32F-FPELIM-NEXT:    sw t1, 0(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw a7, 8(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw a5, 12(sp)
+; ILP32-ILP32F-FPELIM-NEXT:    sw a6, 16(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    li a7, 4
+; ILP32-ILP32F-FPELIM-NEXT:    addi a5, t2, 491
+; ILP32-ILP32F-FPELIM-NEXT:    addi t0, t3, -1967
+; ILP32-ILP32F-FPELIM-NEXT:    addi t1, t4, -328
+; ILP32-ILP32F-FPELIM-NEXT:    addi t2, t5, 1311
+; ILP32-ILP32F-FPELIM-NEXT:    addi a6, t6, -2048
 ; ILP32-ILP32F-FPELIM-NEXT:    sw t2, 32(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    sw t1, 36(sp)
 ; ILP32-ILP32F-FPELIM-NEXT:    sw t0, 40(sp)
@@ -2320,36 +2320,36 @@ define void @va5_aligned_stack_caller() nounwind {
 ; ILP32-ILP32F-WITHFP-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
 ; ILP32-ILP32F-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; ILP32-ILP32F-WITHFP-NEXT:    addi s0, sp, 64
-; ILP32-ILP32F-WITHFP-NEXT:    li a0, 17
-; ILP32-ILP32F-WITHFP-NEXT:    li a1, 16
-; ILP32-ILP32F-WITHFP-NEXT:    sw a1, 20(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    sw a0, 24(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    li a0, 15
-; ILP32-ILP32F-WITHFP-NEXT:    lui a1, 262236
-; ILP32-ILP32F-WITHFP-NEXT:    addi a1, a1, 655
-; ILP32-ILP32F-WITHFP-NEXT:    lui a2, 377487
-; ILP32-ILP32F-WITHFP-NEXT:    addi a2, a2, 1475
-; ILP32-ILP32F-WITHFP-NEXT:    li a3, 14
-; ILP32-ILP32F-WITHFP-NEXT:    sw a3, 0(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    sw a2, 8(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    sw a1, 12(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    sw a0, 16(sp)
-; ILP32-ILP32F-WITHFP-NEXT:    lui a0, 262153
-; ILP32-ILP32F-WITHFP-NEXT:    addi a5, a0, 491
-; ILP32-ILP32F-WITHFP-NEXT:    lui a0, 545260
-; ILP32-ILP32F-WITHFP-NEXT:    addi t0, a0, -1967
-; ILP32-ILP32F-WITHFP-NEXT:    lui a0, 964690
-; ILP32-ILP32F-WITHFP-NEXT:    addi t1, a0, -328
-; ILP32-ILP32F-WITHFP-NEXT:    lui a0, 335544
-; ILP32-ILP32F-WITHFP-NEXT:    addi t2, a0, 1311
-; ILP32-ILP32F-WITHFP-NEXT:    lui a0, 688509
-; ILP32-ILP32F-WITHFP-NEXT:    addi a6, a0, -2048
+; ILP32-ILP32F-WITHFP-NEXT:    li a4, 17
+; ILP32-ILP32F-WITHFP-NEXT:    li a5, 16
+; ILP32-ILP32F-WITHFP-NEXT:    li a6, 15
+; ILP32-ILP32F-WITHFP-NEXT:    lui a7, 262236
+; ILP32-ILP32F-WITHFP-NEXT:    lui t0, 377487
+; ILP32-ILP32F-WITHFP-NEXT:    li t1, 14
+; ILP32-ILP32F-WITHFP-NEXT:    lui t2, 262153
+; ILP32-ILP32F-WITHFP-NEXT:    lui t3, 545260
+; ILP32-ILP32F-WITHFP-NEXT:    lui t4, 964690
+; ILP32-ILP32F-WITHFP-NEXT:    lui t5, 335544
+; ILP32-ILP32F-WITHFP-NEXT:    lui t6, 688509
 ; ILP32-ILP32F-WITHFP-NEXT:    li a0, 1
 ; ILP32-ILP32F-WITHFP-NEXT:    li a1, 11
 ; ILP32-ILP32F-WITHFP-NEXT:    addi a2, s0, -32
 ; ILP32-ILP32F-WITHFP-NEXT:    li a3, 12
+; ILP32-ILP32F-WITHFP-NEXT:    sw a5, 20(sp)
+; ILP32-ILP32F-WITHFP-NEXT:    sw a4, 24(sp)
 ; ILP32-ILP32F-WITHFP-NEXT:    li a4, 13
+; ILP32-ILP32F-WITHFP-NEXT:    addi a5, a7, 655
+; ILP32-ILP32F-WITHFP-NEXT:    addi a7, t0, 1475
+; ILP32-ILP32F-WITHFP-NEXT:    sw t1, 0(sp)
+; ILP32-ILP32F-WITHFP-NEXT:    sw a7, 8(sp)
+; ILP32-ILP32F-WITHFP-NEXT:    sw a5, 12(sp)
+; ILP32-ILP32F-WITHFP-NEXT:    sw a6, 16(sp)
 ; ILP32-ILP32F-WITHFP-NEXT:    li a7, 4
+; ILP32-ILP32F-WITHFP-NEXT:    addi a5, t2, 491
+; ILP32-ILP32F-WITHFP-NEXT:    addi t0, t3, -1967
+; ILP32-ILP32F-WITHFP-NEXT:    addi t1, t4, -328
+; ILP32-ILP32F-WITHFP-NEXT:    addi t2, t5, 1311
+; ILP32-ILP32F-WITHFP-NEXT:    addi a6, t6, -2048
 ; ILP32-ILP32F-WITHFP-NEXT:    sw t2, -32(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    sw t1, -28(s0)
 ; ILP32-ILP32F-WITHFP-NEXT:    sw t0, -24(s0)
@@ -2364,36 +2364,36 @@ define void @va5_aligned_stack_caller() nounwind {
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM:       # %bb.0:
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi sp, sp, -64
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a0, 262236
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a0, a0, 655
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a1, 377487
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a1, a1, 1475
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a2, 17
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a3, 16
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 20(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a2, 24(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a2, 15
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a3, 14
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a3, 0(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a1, 8(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a2, 16(sp)
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a0, 262153
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a5, a0, 491
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a0, 545260
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t0, a0, -1967
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a0, 964690
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t1, a0, -328
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a0, 335544
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t2, a0, 1311
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a0, 688509
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a6, a0, -2048
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a5, 262236
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui a6, 377487
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a4, 17
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a7, 16
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li t0, 15
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li t1, 14
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t2, 262153
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t3, 545260
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t4, 964690
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t5, 335544
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    lui t6, 688509
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a0, 1
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a1, 11
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a2, sp, 32
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a3, 12
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a7, 20(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a4, 24(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a4, 13
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a5, a5, 655
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a6, a6, 1475
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t1, 0(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a6, 8(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw a5, 12(sp)
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t0, 16(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    li a7, 4
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a5, t2, 491
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t0, t3, -1967
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t1, t4, -328
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi t2, t5, 1311
+; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    addi a6, t6, -2048
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t2, 32(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t1, 36(sp)
 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT:    sw t0, 40(sp)
@@ -2410,37 +2410,37 @@ define void @va5_aligned_stack_caller() nounwind {
 ; ILP32E-FPELIM-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; ILP32E-FPELIM-NEXT:    addi s0, sp, 64
 ; ILP32E-FPELIM-NEXT:    andi sp, sp, -16
-; ILP32E-FPELIM-NEXT:    li a0, 17
-; ILP32E-FPELIM-NEXT:    li a1, 16
-; ILP32E-FPELIM-NEXT:    li a2, 15
-; ILP32E-FPELIM-NEXT:    sw a2, 16(sp)
-; ILP32E-FPELIM-NEXT:    sw a1, 20(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 24(sp)
-; ILP32E-FPELIM-NEXT:    lui a0, 262236
-; ILP32E-FPELIM-NEXT:    addi a0, a0, 655
-; ILP32E-FPELIM-NEXT:    lui a1, 377487
-; ILP32E-FPELIM-NEXT:    addi a1, a1, 1475
-; ILP32E-FPELIM-NEXT:    li a2, 14
-; ILP32E-FPELIM-NEXT:    li a3, 4
-; ILP32E-FPELIM-NEXT:    sw a3, 0(sp)
-; ILP32E-FPELIM-NEXT:    sw a2, 4(sp)
-; ILP32E-FPELIM-NEXT:    sw a1, 8(sp)
-; ILP32E-FPELIM-NEXT:    sw a0, 12(sp)
-; ILP32E-FPELIM-NEXT:    lui a0, 262153
-; ILP32E-FPELIM-NEXT:    addi a6, a0, 491
-; ILP32E-FPELIM-NEXT:    lui a0, 545260
-; ILP32E-FPELIM-NEXT:    addi a7, a0, -1967
-; ILP32E-FPELIM-NEXT:    lui a0, 964690
-; ILP32E-FPELIM-NEXT:    addi t0, a0, -328
-; ILP32E-FPELIM-NEXT:    lui a0, 335544
-; ILP32E-FPELIM-NEXT:    addi t1, a0, 1311
-; ILP32E-FPELIM-NEXT:    lui a0, 688509
-; ILP32E-FPELIM-NEXT:    addi a5, a0, -2048
+; ILP32E-FPELIM-NEXT:    li a3, 17
+; ILP32E-FPELIM-NEXT:    li a4, 16
+; ILP32E-FPELIM-NEXT:    li a5, 15
+; ILP32E-FPELIM-NEXT:    lui a6, 262236
+; ILP32E-FPELIM-NEXT:    lui a7, 377487
+; ILP32E-FPELIM-NEXT:    li t0, 14
+; ILP32E-FPELIM-NEXT:    li t1, 4
+; ILP32E-FPELIM-NEXT:    lui t2, 262153
+; ILP32E-FPELIM-NEXT:    lui t3, 545260
+; ILP32E-FPELIM-NEXT:    lui t4, 964690
+; ILP32E-FPELIM-NEXT:    lui t5, 335544
+; ILP32E-FPELIM-NEXT:    lui t6, 688509
 ; ILP32E-FPELIM-NEXT:    li a0, 1
 ; ILP32E-FPELIM-NEXT:    li a1, 11
 ; ILP32E-FPELIM-NEXT:    addi a2, sp, 32
+; ILP32E-FPELIM-NEXT:    sw a5, 16(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 20(sp)
+; ILP32E-FPELIM-NEXT:    sw a3, 24(sp)
 ; ILP32E-FPELIM-NEXT:    li a3, 12
+; ILP32E-FPELIM-NEXT:    addi a4, a6, 655
+; ILP32E-FPELIM-NEXT:    addi a5, a7, 1475
+; ILP32E-FPELIM-NEXT:    sw t1, 0(sp)
+; ILP32E-FPELIM-NEXT:    sw t0, 4(sp)
+; ILP32E-FPELIM-NEXT:    sw a5, 8(sp)
+; ILP32E-FPELIM-NEXT:    sw a4, 12(sp)
 ; ILP32E-FPELIM-NEXT:    li a4, 13
+; ILP32E-FPELIM-NEXT:    addi a6, t2, 491
+; ILP32E-FPELIM-NEXT:    addi a7, t3, -1967
+; ILP32E-FPELIM-NEXT:    addi t0, t4, -328
+; ILP32E-FPELIM-NEXT:    addi t1, t5, 1311
+; ILP32E-FPELIM-NEXT:    addi a5, t6, -2048
 ; ILP32E-FPELIM-NEXT:    sw t1, 32(sp)
 ; ILP32E-FPELIM-NEXT:    sw t0, 36(sp)
 ; ILP32E-FPELIM-NEXT:    sw a7, 40(sp)
@@ -2459,37 +2459,37 @@ define void @va5_aligned_stack_caller() nounwind {
 ; ILP32E-WITHFP-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
 ; ILP32E-WITHFP-NEXT:    addi s0, sp, 64
 ; ILP32E-WITHFP-NEXT:    andi sp, sp, -16
-; ILP32E-WITHFP-NEXT:    li a0, 17
-; ILP32E-WITHFP-NEXT:    li a1, 16
-; ILP32E-WITHFP-NEXT:    li a2, 15
-; ILP32E-WITHFP-NEXT:    sw a2, 16(sp)
-; ILP32E-WITHFP-NEXT:    sw a1, 20(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 24(sp)
-; ILP32E-WITHFP-NEXT:    lui a0, 262236
-; ILP32E-WITHFP-NEXT:    addi a0, a0, 655
-; ILP32E-WITHFP-NEXT:    lui a1, 377487
-; ILP32E-WITHFP-NEXT:    addi a1, a1, 1475
-; ILP32E-WITHFP-NEXT:    li a2, 14
-; ILP32E-WITHFP-NEXT:    li a3, 4
-; ILP32E-WITHFP-NEXT:    sw a3, 0(sp)
-; ILP32E-WITHFP-NEXT:    sw a2, 4(sp)
-; ILP32E-WITHFP-NEXT:    sw a1, 8(sp)
-; ILP32E-WITHFP-NEXT:    sw a0, 12(sp)
-; ILP32E-WITHFP-NEXT:    lui a0, 262153
-; ILP32E-WITHFP-NEXT:    addi a6, a0, 491
-; ILP32E-WITHFP-NEXT:    lui a0, 545260
-; ILP32E-WITHFP-NEXT:    addi a7, a0, -1967
-; ILP32E-WITHFP-NEXT:    lui a0, 964690
-; ILP32E-WITHFP-NEXT:    addi t0, a0, -328
-; ILP32E-WITHFP-NEXT:    lui a0, 335544
-; ILP32E-WITHFP-NEXT:    addi t1, a0, 1311
-; ILP32E-WITHFP-NEXT:    lui a0, 688509
-; ILP32E-WITHFP-NEXT:    addi a5, a0, -2048
+; ILP32E-WITHFP-NEXT:    li a3, 17
+; ILP32E-WITHFP-NEXT:    li a4, 16
+; ILP32E-WITHFP-NEXT:    li a5, 15
+; ILP32E-WITHFP-NEXT:    lui a6, 262236
+; ILP32E-WITHFP-NEXT:    lui a7, 377487
+; ILP32E-WITHFP-NEXT:    li t0, 14
+; ILP32E-WITHFP-NEXT:    li t1, 4
+; ILP32E-WITHFP-NEXT:    lui t2, 262153
+; ILP32E-WITHFP-NEXT:    lui t3, 545260
+; ILP32E-WITHFP-NEXT:    lui t4, 964690
+; ILP32E-WITHFP-NEXT:    lui t5, 335544
+; ILP32E-WITHFP-NEXT:    lui t6, 688509
 ; ILP32E-WITHFP-NEXT:    li a0, 1
 ; ILP32E-WITHFP-NEXT:    li a1, 11
 ; ILP32E-WITHFP-NEXT:    addi a2, sp, 32
+; ILP32E-WITHFP-NEXT:    sw a5, 16(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 20(sp)
+; ILP32E-WITHFP-NEXT:    sw a3, 24(sp)
 ; ILP32E-WITHFP-NEXT:    li a3, 12
+; ILP32E-WITHFP-NEXT:    addi a4, a6, 655
+; ILP32E-WITHFP-NEXT:    addi a5, a7, 1475
+; ILP32E-WITHFP-NEXT:    sw t1, 0(sp)
+; ILP32E-WITHFP-NEXT:    sw t0, 4(sp)
+; ILP32E-WITHFP-NEXT:    sw a5, 8(sp)
+; ILP32E-WITHFP-NEXT:    sw a4, 12(sp)
 ; ILP32E-WITHFP-NEXT:    li a4, 13
+; ILP32E-WITHFP-NEXT:    addi a6, t2, 491
+; ILP32E-WITHFP-NEXT:    addi a7, t3, -1967
+; ILP32E-WITHFP-NEXT:    addi t0, t4, -328
+; ILP32E-WITHFP-NEXT:    addi t1, t5, 1311
+; ILP32E-WITHFP-NEXT:    addi a5, t6, -2048
 ; ILP32E-WITHFP-NEXT:    sw t1, 32(sp)
 ; ILP32E-WITHFP-NEXT:    sw t0, 36(sp)
 ; ILP32E-WITHFP-NEXT:    sw a7, 40(sp)
@@ -2508,21 +2508,21 @@ define void @va5_aligned_stack_caller() nounwind {
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li t0, 17
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li t1, 16
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li t2, 15
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, %hi(.LCPI11_0)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    ld t3, %lo(.LCPI11_0)(a0)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, %hi(.LCPI11_1)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a2, %lo(.LCPI11_1)(a0)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, %hi(.LCPI11_2)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a3, %lo(.LCPI11_2)(a0)
-; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a0, 2384
-; LP64-LP64F-LP64D-FPELIM-NEXT:    addiw a6, a0, 761
-; LP64-LP64F-LP64D-FPELIM-NEXT:    slli a6, a6, 11
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a2, %hi(.LCPI11_0)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a3, %hi(.LCPI11_1)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui a6, %hi(.LCPI11_2)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    lui t3, 2384
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a0, 1
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a1, 11
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a4, 12
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a5, 13
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    li a7, 14
-; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t3, 0(sp)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    ld t4, %lo(.LCPI11_0)(a2)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a2, %lo(.LCPI11_1)(a3)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    ld a3, %lo(.LCPI11_2)(a6)
+; LP64-LP64F-LP64D-FPELIM-NEXT:    addiw a6, t3, 761
+; LP64-LP64F-LP64D-FPELIM-NEXT:    slli a6, a6, 11
+; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t4, 0(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t2, 8(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t1, 16(sp)
 ; LP64-LP64F-LP64D-FPELIM-NEXT:    sd t0, 24(sp)
@@ -2540,21 +2540,21 @@ define void @va5_aligned_stack_caller() nounwind {
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li t0, 17
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li t1, 16
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li t2, 15
-; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a0, %hi(.LCPI11_0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    ld t3, %lo(.LCPI11_0)(a0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a0, %hi(.LCPI11_1)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a2, %lo(.LCPI11_1)(a0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a0, %hi(.LCPI11_2)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a3, %lo(.LCPI11_2)(a0)
-; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a0, 2384
-; LP64-LP64F-LP64D-WITHFP-NEXT:    addiw a6, a0, 761
-; LP64-LP64F-LP64D-WITHFP-NEXT:    slli a6, a6, 11
+; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a2, %hi(.LCPI11_0)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a3, %hi(.LCPI11_1)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    lui a6, %hi(.LCPI11_2)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    lui t3, 2384
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a0, 1
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a1, 11
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a4, 12
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a5, 13
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    li a7, 14
-; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t3, 0(sp)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    ld t4, %lo(.LCPI11_0)(a2)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a2, %lo(.LCPI11_1)(a3)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    ld a3, %lo(.LCPI11_2)(a6)
+; LP64-LP64F-LP64D-WITHFP-NEXT:    addiw a6, t3, 761
+; LP64-LP64F-LP64D-WITHFP-NEXT:    slli a6, a6, 11
+; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t4, 0(sp)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t2, 8(sp)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t1, 16(sp)
 ; LP64-LP64F-LP64D-WITHFP-NEXT:    sd t0, 24(sp)
@@ -2568,25 +2568,25 @@ define void @va5_aligned_stack_caller() nounwind {
 ; LP64E-FPELIM:       # %bb.0:
 ; LP64E-FPELIM-NEXT:    addi sp, sp, -56
 ; LP64E-FPELIM-NEXT:    sd ra, 48(sp) # 8-byte Folded Spill
-; LP64E-FPELIM-NEXT:    li a0, 17
-; LP64E-FPELIM-NEXT:    li a1, 16
-; LP64E-FPELIM-NEXT:    sd a1, 32(sp)
-; LP64E-FPELIM-NEXT:    sd a0, 40(sp)
+; LP64E-FPELIM-NEXT:    li a2, 17
+; LP64E-FPELIM-NEXT:    li a3, 16
 ; LP64E-FPELIM-NEXT:    li a6, 15
-; LP64E-FPELIM-NEXT:    lui a0, %hi(.LCPI11_0)
-; LP64E-FPELIM-NEXT:    ld a7, %lo(.LCPI11_0)(a0)
+; LP64E-FPELIM-NEXT:    lui a7, %hi(.LCPI11_0)
 ; LP64E-FPELIM-NEXT:    li t0, 14
-; LP64E-FPELIM-NEXT:    lui a0, 2384
-; LP64E-FPELIM-NEXT:    addiw a0, a0, 761
-; LP64E-FPELIM-NEXT:    slli t1, a0, 11
-; LP64E-FPELIM-NEXT:    lui a0, %hi(.LCPI11_1)
-; LP64E-FPELIM-NEXT:    ld a2, %lo(.LCPI11_1)(a0)
-; LP64E-FPELIM-NEXT:    lui a0, %hi(.LCPI11_2)
-; LP64E-FPELIM-NEXT:    ld a3, %lo(.LCPI11_2)(a0)
+; LP64E-FPELIM-NEXT:    lui t1, 2384
+; LP64E-FPELIM-NEXT:    lui t2, %hi(.LCPI11_1)
+; LP64E-FPELIM-NEXT:    lui t3, %hi(.LCPI11_2)
 ; LP64E-FPELIM-NEXT:    li a0, 1
 ; LP64E-FPELIM-NEXT:    li a1, 11
 ; LP64E-FPELIM-NEXT:    li a4, 12
+; LP64E-FPELIM-NEXT:    sd a3, 32(sp)
+; LP64E-FPELIM-NEXT:    sd a2, 40(sp)
 ; LP64E-FPELIM-NEXT:    li a5, 13
+; LP64E-FPELIM-NEXT:    ld a7, %lo(.LCPI11_0)(a7)
+; LP64E-FPELIM-NEXT:    addiw t1, t1, 761
+; LP64E-FPELIM-NEXT:    ld a2, %lo(.LCPI11_1)(t2)
+; LP64E-FPELIM-NEXT:    ld a3, %lo(.LCPI11_2)(t3)
+; LP64E-FPELIM-NEXT:    slli t1, t1, 11
 ; LP64E-FPELIM-NEXT:    sd t1, 0(sp)
 ; LP64E-FPELIM-NEXT:    sd t0, 8(sp)
 ; LP64E-FPELIM-NEXT:    sd a7, 16(sp)
@@ -2602,25 +2602,25 @@ define void @va5_aligned_stack_caller() nounwind {
 ; LP64E-WITHFP-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
 ; LP64E-WITHFP-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
 ; LP64E-WITHFP-NEXT:    addi s0, sp, 64
-; LP64E-WITHFP-NEXT:    li a0, 17
-; LP64E-WITHFP-NEXT:    li a1, 16
-; LP64E-WITHFP-NEXT:    sd a1, 32(sp)
-; LP64E-WITHFP-NEXT:    sd a0, 40(sp)
+; LP64E-WITHFP-NEXT:    li a2, 17
+; LP64E-WITHFP-NEXT:    li a3, 16
 ; LP64E-WITHFP-NEXT:    li a6, 15
-; LP64E-WITHFP-NEXT:    lui a0, %hi(.LCPI11_0)
-; LP64E-WITHFP-NEXT:    ld a7, %lo(.LCPI11_0)(a0)
+; LP64E-WITHFP-NEXT:    lui a7, %hi(.LCPI11_0)
 ; LP64E-WITHFP-NEXT:    li t0, 14
-; LP64E-WITHFP-NEXT:    lui a0, 2384
-; LP64E-WITHFP-NEXT:    addiw a0, a0, 761
-; LP64E-WITHFP-NEXT:    slli t1, a0, 11
-; LP64E-WITHFP-NEXT:    lui a0, %hi(.LCPI11_1)
-; LP64E-WITHFP-NEXT:    ld a2, %lo(.LCPI11_1)(a0)
-; LP64E-WITHFP-NEXT:    lui a0, %hi(.LCPI11_2)
-; LP64E-WITHFP-NEXT:    ld a3, %lo(.LCPI11_2)(a0)
+; LP64E-WITHFP-NEXT:    lui t1, 2384
+; LP64E-WITHFP-NEXT:    lui t2, %hi(.LCPI11_1)
+; LP64E-WITHFP-NEXT:    lui t3, %hi(.LCPI11_2)
 ; LP64E-WITHFP-NEXT:    li a0, 1
 ; LP64E-WITHFP-NEXT:    li a1, 11
 ; LP64E-WITHFP-NEXT:    li a4, 12
+; LP64E-WITHFP-NEXT:    sd a3, 32(sp)
+; LP64E-WITHFP-NEXT:    sd a2, 40(sp)
 ; LP64E-WITHFP-NEXT:    li a5, 13
+; LP64E-WITHFP-NEXT:    ld a7, %lo(.LCPI11_0)(a7)
+; LP64E-WITHFP-NEXT:    addiw t1, t1, 761
+; LP64E-WITHFP-NEXT:    ld a2, %lo(.LCPI11_1)(t2)
+; LP64E-WITHFP-NEXT:    ld a3, %lo(.LCPI11_2)(t3)
+; LP64E-WITHFP-NEXT:    slli t1, t1, 11
 ; LP64E-WITHFP-NEXT:    sd t1, 0(sp)
 ; LP64E-WITHFP-NEXT:    sd t0, 8(sp)
 ; LP64E-WITHFP-NEXT:    sd a7, 16(sp)

diff  --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index cae59c79aaaa8a..437b7e557718cc 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -5,18 +5,18 @@
 define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_4bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lb a0, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    srlw a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 16
 ; RV64I-NEXT:    srli a3, a0, 24
@@ -30,25 +30,25 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-LABEL: lshr_4bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -69,18 +69,18 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_4bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lb a0, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    sllw a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 16
 ; RV64I-NEXT:    srli a3, a0, 24
@@ -94,25 +94,25 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-LABEL: shl_4bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -133,18 +133,18 @@ define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_4bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lb a0, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    slli a1, a1, 3
 ; RV64I-NEXT:    sraw a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 16
 ; RV64I-NEXT:    srli a3, a0, 24
@@ -158,25 +158,25 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-LABEL: ashr_4bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    slli a1, a1, 3
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
@@ -198,95 +198,95 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu t2, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 6(a1)
-; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    lbu a4, 2(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t1, 1(a1)
+; RV64I-NEXT:    or t0, t2, t0
+; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a4, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a3, a3, 35
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    slli a4, a4, 35
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a1, a4, a1
 ; RV64I-NEXT:    srl a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 48
 ; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    srli a5, a0, 40
+; RV64I-NEXT:    srli a6, a0, 16
+; RV64I-NEXT:    srli a7, a0, 24
+; RV64I-NEXT:    srli t0, a0, 8
 ; RV64I-NEXT:    sb a4, 4(a2)
 ; RV64I-NEXT:    sb a5, 5(a2)
 ; RV64I-NEXT:    sb a1, 6(a2)
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    sb t0, 1(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
+; RV64I-NEXT:    sb a7, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_8bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 7(a0)
+; RV32I-NEXT:    lbu a4, 6(a0)
+; RV32I-NEXT:    lbu a5, 7(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a5, a1, 3
-; RV32I-NEXT:    addi a4, a5, -32
-; RV32I-NEXT:    srl a1, a3, a5
-; RV32I-NEXT:    bltz a4, .LBB3_2
+; RV32I-NEXT:    or a5, a4, a3
+; RV32I-NEXT:    or a4, a1, a6
+; RV32I-NEXT:    slli a4, a4, 3
+; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    srl a1, a5, a4
+; RV32I-NEXT:    bltz a3, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB3_3
@@ -297,32 +297,32 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a0, 3(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    slli a5, a5, 1
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    not a7, a4
 ; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    srl a0, a0, a5
-; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    sll a3, a3, a5
-; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    srl a0, a0, a4
+; RV32I-NEXT:    sll a4, a5, a7
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:  .LBB3_3:
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    and a1, a4, a1
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    srli a5, a0, 24
+; RV32I-NEXT:    and a1, a3, a1
 ; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    srli a4, a1, 24
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    srli a6, a1, 24
+; RV32I-NEXT:    srli a7, a1, 8
 ; RV32I-NEXT:    sb a1, 4(a2)
-; RV32I-NEXT:    sb a5, 5(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
 ; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    sb a5, 3(a2)
 ; RV32I-NEXT:    ret
   %src = load i64, ptr %src.ptr, align 1
   %byteOff = load i64, ptr %byteOff.ptr, align 1
@@ -334,95 +334,95 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu t2, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 6(a1)
-; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    lbu a4, 2(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t1, 1(a1)
+; RV64I-NEXT:    or t0, t2, t0
+; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a4, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a3, a3, 35
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    slli a4, a4, 35
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a1, a4, a1
 ; RV64I-NEXT:    sll a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 48
 ; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    srli a5, a0, 40
+; RV64I-NEXT:    srli a6, a0, 16
+; RV64I-NEXT:    srli a7, a0, 24
+; RV64I-NEXT:    srli t0, a0, 8
 ; RV64I-NEXT:    sb a4, 4(a2)
 ; RV64I-NEXT:    sb a5, 5(a2)
 ; RV64I-NEXT:    sb a1, 6(a2)
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    sb t0, 1(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
+; RV64I-NEXT:    sb a7, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_8bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a6, 0(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    slli a5, a1, 3
-; RV32I-NEXT:    addi a4, a5, -32
-; RV32I-NEXT:    sll a1, a3, a5
-; RV32I-NEXT:    bltz a4, .LBB4_2
+; RV32I-NEXT:    or a5, a4, a3
+; RV32I-NEXT:    or a4, a1, a6
+; RV32I-NEXT:    slli a4, a4, 3
+; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    sll a1, a5, a4
+; RV32I-NEXT:    bltz a3, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB4_3
@@ -433,32 +433,32 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a0, 7(a0)
 ; RV32I-NEXT:    slli a6, a6, 8
 ; RV32I-NEXT:    or a6, a6, a7
+; RV32I-NEXT:    srli a5, a5, 1
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    not a7, a4
 ; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    sll a0, a0, a5
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    srl a3, a3, a5
-; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    sll a0, a0, a4
+; RV32I-NEXT:    srl a4, a5, a7
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:  .LBB4_3:
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    and a1, a4, a1
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    srli a5, a0, 24
+; RV32I-NEXT:    and a1, a3, a1
 ; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    srli a4, a1, 24
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    srli a6, a1, 24
+; RV32I-NEXT:    srli a7, a1, 8
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
+; RV32I-NEXT:    sb a7, 1(a2)
 ; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    sb a4, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb a4, 6(a2)
+; RV32I-NEXT:    sb a5, 7(a2)
 ; RV32I-NEXT:    ret
   %src = load i64, ptr %src.ptr, align 1
   %byteOff = load i64, ptr %byteOff.ptr, align 1
@@ -470,67 +470,67 @@ define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu t2, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a4, 4(a1)
-; RV64I-NEXT:    lbu a5, 5(a1)
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 6(a1)
-; RV64I-NEXT:    lbu a6, 7(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    lbu a4, 2(a1)
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t1, 1(a1)
+; RV64I-NEXT:    or t0, t2, t0
+; RV64I-NEXT:    lbu t2, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a4, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a3, a3, 35
-; RV64I-NEXT:    or a1, a3, a1
+; RV64I-NEXT:    slli a4, a4, 35
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a1, a4, a1
 ; RV64I-NEXT:    sra a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 48
 ; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    srli a5, a0, 40
+; RV64I-NEXT:    srli a6, a0, 16
+; RV64I-NEXT:    srli a7, a0, 24
+; RV64I-NEXT:    srli t0, a0, 8
 ; RV64I-NEXT:    sb a4, 4(a2)
 ; RV64I-NEXT:    sb a5, 5(a2)
 ; RV64I-NEXT:    sb a1, 6(a2)
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    sb t0, 1(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
+; RV64I-NEXT:    sb a7, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_8bytes:
@@ -540,61 +540,61 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a5, 6(a0)
 ; RV32I-NEXT:    lbu a6, 7(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 1(a1)
 ; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a4, a6, 24
-; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a5, a1, a6
-; RV32I-NEXT:    slli a5, a5, 3
-; RV32I-NEXT:    addi a6, a5, -32
-; RV32I-NEXT:    sra a1, a3, a5
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    slli a4, a5, 16
+; RV32I-NEXT:    slli a5, a6, 24
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    or a4, a4, a3
+; RV32I-NEXT:    or a3, a1, a7
+; RV32I-NEXT:    slli a3, a3, 3
+; RV32I-NEXT:    addi a6, a3, -32
+; RV32I-NEXT:    sra a1, a4, a3
 ; RV32I-NEXT:    bltz a6, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a4, a4, 31
+; RV32I-NEXT:    srai a5, a5, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    j .LBB5_3
 ; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 1(a0)
 ; RV32I-NEXT:    lbu a6, 0(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
+; RV32I-NEXT:    slli a4, a4, 1
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a7
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    srl a0, a0, a5
-; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    not a4, a5
-; RV32I-NEXT:    sll a3, a3, a4
+; RV32I-NEXT:    not a6, a3
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    srl a0, a0, a3
+; RV32I-NEXT:    sll a3, a4, a6
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:  .LBB5_3:
 ; RV32I-NEXT:    srli a3, a1, 16
 ; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    srli a6, a0, 16
+; RV32I-NEXT:    srli a7, a0, 24
 ; RV32I-NEXT:    sb a1, 4(a2)
 ; RV32I-NEXT:    sb a5, 5(a2)
 ; RV32I-NEXT:    sb a3, 6(a2)
 ; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    ret
   %src = load i64, ptr %src.ptr, align 1
   %byteOff = load i64, ptr %byteOff.ptr, align 1
@@ -607,232 +607,232 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a1)
-; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 1(a1)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu t3, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t2, 1(a1)
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    lbu t3, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    or a5, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a4, a4, 35
-; RV64I-NEXT:    or a5, a4, a1
-; RV64I-NEXT:    addi a4, a5, -64
-; RV64I-NEXT:    srl a1, a3, a5
-; RV64I-NEXT:    bltz a4, .LBB6_2
+; RV64I-NEXT:    slli a6, a5, 35
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a4, a6, a1
+; RV64I-NEXT:    addi a3, a4, -64
+; RV64I-NEXT:    srl a1, a5, a4
+; RV64I-NEXT:    bltz a3, .LBB6_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
 ; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 0(a0)
-; RV64I-NEXT:    lbu t0, 2(a0)
-; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    lbu t1, 0(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 5(a0)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    not a7, a4
+; RV64I-NEXT:    slli a5, a5, 1
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    srl a0, a0, a5
-; RV64I-NEXT:    not a5, a5
-; RV64I-NEXT:    slli a3, a3, 1
-; RV64I-NEXT:    sll a3, a3, a5
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srl a0, a0, a4
+; RV64I-NEXT:    sll a4, a5, a7
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:  .LBB6_3:
-; RV64I-NEXT:    srai a4, a4, 63
-; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    srai a3, a3, 63
+; RV64I-NEXT:    srli a4, a0, 56
+; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a6, a0, 40
+; RV64I-NEXT:    srli a7, a0, 32
+; RV64I-NEXT:    srli t0, a0, 24
+; RV64I-NEXT:    srli t1, a0, 16
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    sb a7, 4(a2)
+; RV64I-NEXT:    sb a6, 5(a2)
+; RV64I-NEXT:    sb a5, 6(a2)
+; RV64I-NEXT:    sb a4, 7(a2)
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    srli a4, a1, 48
 ; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    srli a6, a1, 32
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t2, a1, 16
 ; RV64I-NEXT:    sb a6, 12(a2)
 ; RV64I-NEXT:    sb a5, 13(a2)
 ; RV64I-NEXT:    sb a4, 14(a2)
 ; RV64I-NEXT:    sb a3, 15(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    srli a3, a1, 8
 ; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    sb a5, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    sb a4, 5(a2)
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb a3, 9(a2)
+; RV64I-NEXT:    sb t2, 10(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    sb t1, 2(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    lbu t3, 10(a0)
+; RV32I-NEXT:    lbu t4, 11(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu t0, 1(a1)
-; RV32I-NEXT:    or a0, a0, a7
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    mv t2, sp
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a1, a1, t1
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a0, 12(sp)
-; RV32I-NEXT:    andi a0, a1, 12
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    lw a5, 8(a0)
-; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    srl a6, a4, a1
-; RV32I-NEXT:    andi a7, a1, 24
-; RV32I-NEXT:    xori a7, a7, 31
-; RV32I-NEXT:    slli t0, a5, 1
-; RV32I-NEXT:    sll t0, t0, a7
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    srl a3, a3, a1
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    sll a4, a4, a7
-; RV32I-NEXT:    or a4, a3, a4
-; RV32I-NEXT:    srl a5, a5, a1
-; RV32I-NEXT:    slli t1, a0, 1
-; RV32I-NEXT:    sll a7, t1, a7
-; RV32I-NEXT:    or a7, a5, a7
-; RV32I-NEXT:    srl a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    slli a0, a1, 3
+; RV32I-NEXT:    andi a1, a1, 12
+; RV32I-NEXT:    add a1, t2, a1
+; RV32I-NEXT:    andi a3, a0, 24
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw a6, 8(a1)
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    srl a7, a5, a0
+; RV32I-NEXT:    slli t0, a6, 1
+; RV32I-NEXT:    srl a4, a4, a0
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl a6, a6, a0
+; RV32I-NEXT:    slli t1, a1, 1
+; RV32I-NEXT:    srl a0, a1, a0
+; RV32I-NEXT:    sll a1, t0, a3
+; RV32I-NEXT:    sll a5, a5, a3
+; RV32I-NEXT:    sll a3, t1, a3
+; RV32I-NEXT:    srli t0, a0, 16
 ; RV32I-NEXT:    srli t1, a0, 24
 ; RV32I-NEXT:    srli t2, a0, 8
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a5, a4, a5
+; RV32I-NEXT:    or a3, a6, a3
 ; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    sb t0, 14(a2)
 ; RV32I-NEXT:    sb t1, 15(a2)
-; RV32I-NEXT:    srli a0, a7, 16
-; RV32I-NEXT:    srli a1, a7, 24
-; RV32I-NEXT:    srli a7, a7, 8
-; RV32I-NEXT:    sb a5, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    srli t0, a3, 24
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    srli t1, a5, 16
+; RV32I-NEXT:    srli t2, a5, 24
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    srli t3, a1, 16
+; RV32I-NEXT:    srli t4, a1, 24
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    sb a3, 9(a2)
 ; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    sb a1, 11(a2)
-; RV32I-NEXT:    srli a0, a4, 16
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a3, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, t0, 16
-; RV32I-NEXT:    srli a1, t0, 24
-; RV32I-NEXT:    srli a3, t0, 8
-; RV32I-NEXT:    sb a6, 4(a2)
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    sb t0, 11(a2)
+; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb a5, 1(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb t2, 3(a2)
+; RV32I-NEXT:    sb a7, 4(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb t3, 6(a2)
+; RV32I-NEXT:    sb t4, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -846,208 +846,208 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a1)
-; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 1(a1)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu t3, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t2, 1(a1)
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    lbu t3, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    or a5, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 5
-; RV64I-NEXT:    slli a4, a4, 37
-; RV64I-NEXT:    or a5, a4, a1
-; RV64I-NEXT:    addi a4, a5, -64
-; RV64I-NEXT:    srl a1, a3, a5
-; RV64I-NEXT:    bltz a4, .LBB7_2
+; RV64I-NEXT:    slli a6, a5, 37
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a4, a6, a1
+; RV64I-NEXT:    addi a3, a4, -64
+; RV64I-NEXT:    srl a1, a5, a4
+; RV64I-NEXT:    bltz a3, .LBB7_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
 ; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 0(a0)
-; RV64I-NEXT:    lbu t0, 2(a0)
-; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    lbu t1, 0(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 5(a0)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    not a7, a4
+; RV64I-NEXT:    slli a5, a5, 1
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    srl a0, a0, a5
-; RV64I-NEXT:    not a5, a5
-; RV64I-NEXT:    slli a3, a3, 1
-; RV64I-NEXT:    sll a3, a3, a5
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srl a0, a0, a4
+; RV64I-NEXT:    sll a4, a5, a7
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:  .LBB7_3:
-; RV64I-NEXT:    srai a4, a4, 63
-; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    srai a3, a3, 63
+; RV64I-NEXT:    srli a4, a0, 56
+; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a6, a0, 40
+; RV64I-NEXT:    srli a7, a0, 32
+; RV64I-NEXT:    srli t0, a0, 24
+; RV64I-NEXT:    srli t1, a0, 16
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    sb a7, 4(a2)
+; RV64I-NEXT:    sb a6, 5(a2)
+; RV64I-NEXT:    sb a5, 6(a2)
+; RV64I-NEXT:    sb a4, 7(a2)
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    srli a4, a1, 48
 ; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    srli a6, a1, 32
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t2, a1, 16
 ; RV64I-NEXT:    sb a6, 12(a2)
 ; RV64I-NEXT:    sb a5, 13(a2)
 ; RV64I-NEXT:    sb a4, 14(a2)
 ; RV64I-NEXT:    sb a3, 15(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    srli a3, a1, 8
 ; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    sb a5, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    sb a4, 5(a2)
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb a3, 9(a2)
+; RV64I-NEXT:    sb t2, 10(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    sb t1, 2(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_16bytes_wordOff:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
+; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu a7, 13(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    or a0, a0, a7
 ; RV32I-NEXT:    lbu a1, 0(a1)
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    mv t2, sp
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or t4, t6, t5
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    andi a1, a1, 12
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, t1, a5
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    add a1, t2, a1
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a0, 12(sp)
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    andi a1, a1, 12
-; RV32I-NEXT:    mv a0, sp
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lw a1, 8(a0)
-; RV32I-NEXT:    lw a3, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    srli a5, a1, 16
-; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    srli a7, a1, 8
-; RV32I-NEXT:    sb a1, 8(a2)
+; RV32I-NEXT:    lw a0, 8(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    srli a7, a0, 8
+; RV32I-NEXT:    srli t0, a1, 16
+; RV32I-NEXT:    srli t1, a1, 24
+; RV32I-NEXT:    srli t2, a1, 8
+; RV32I-NEXT:    srli t3, a4, 16
+; RV32I-NEXT:    srli t4, a4, 24
+; RV32I-NEXT:    srli t5, a4, 8
+; RV32I-NEXT:    srli t6, a3, 16
+; RV32I-NEXT:    sb a0, 8(a2)
 ; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb a5, 10(a2)
 ; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a5, a0, 24
-; RV32I-NEXT:    srli a6, a0, 8
-; RV32I-NEXT:    sb a0, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    sb a5, 15(a2)
-; RV32I-NEXT:    srli a0, a4, 16
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    srli a5, a4, 8
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    sb t1, 15(a2)
+; RV32I-NEXT:    srli a1, a3, 8
 ; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    srli a4, a3, 8
+; RV32I-NEXT:    sb t5, 1(a2)
+; RV32I-NEXT:    sb t3, 2(a2)
+; RV32I-NEXT:    sb t4, 3(a2)
 ; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb t6, 6(a2)
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1061,232 +1061,232 @@ define void @lshr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a1)
-; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 1(a1)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu t3, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t2, 1(a1)
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    lbu t3, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    or a5, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a4, a4, 35
-; RV64I-NEXT:    or a5, a4, a1
-; RV64I-NEXT:    addi a4, a5, -64
-; RV64I-NEXT:    sll a1, a3, a5
-; RV64I-NEXT:    bltz a4, .LBB8_2
+; RV64I-NEXT:    slli a6, a5, 35
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a4, a6, a1
+; RV64I-NEXT:    addi a3, a4, -64
+; RV64I-NEXT:    sll a1, a5, a4
+; RV64I-NEXT:    bltz a3, .LBB8_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
 ; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
-; RV64I-NEXT:    lbu t0, 10(a0)
-; RV64I-NEXT:    lbu t1, 11(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
+; RV64I-NEXT:    lbu t1, 8(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    lbu t1, 13(a0)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    lbu t1, 12(a0)
+; RV64I-NEXT:    lbu t2, 13(a0)
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    not a7, a4
+; RV64I-NEXT:    srli a5, a5, 1
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    sll a0, a0, a5
-; RV64I-NEXT:    not a5, a5
-; RV64I-NEXT:    srli a3, a3, 1
-; RV64I-NEXT:    srl a3, a3, a5
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    sll a0, a0, a4
+; RV64I-NEXT:    srl a4, a5, a7
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:  .LBB8_3:
-; RV64I-NEXT:    srai a4, a4, 63
-; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    srai a3, a3, 63
+; RV64I-NEXT:    srli a4, a0, 56
+; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a6, a0, 40
+; RV64I-NEXT:    srli a7, a0, 32
+; RV64I-NEXT:    srli t0, a0, 24
+; RV64I-NEXT:    srli t1, a0, 16
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    sb a7, 12(a2)
+; RV64I-NEXT:    sb a6, 13(a2)
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    sb a4, 15(a2)
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    srli a4, a1, 48
 ; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    srli a6, a1, 32
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t2, a1, 16
 ; RV64I-NEXT:    sb a6, 4(a2)
 ; RV64I-NEXT:    sb a5, 5(a2)
 ; RV64I-NEXT:    sb a4, 6(a2)
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    srli a3, a1, 8
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
-; RV64I-NEXT:    sb a4, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb a3, 1(a2)
+; RV64I-NEXT:    sb t2, 2(a2)
+; RV64I-NEXT:    sb a7, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb t1, 10(a2)
+; RV64I-NEXT:    sb t0, 11(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    lbu t3, 10(a0)
+; RV32I-NEXT:    lbu t4, 11(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu t0, 1(a1)
-; RV32I-NEXT:    or a0, a0, a7
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t2, 1(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a1, a1, a6
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    addi t2, sp, 16
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a1, a1, t1
 ; RV32I-NEXT:    sw a3, 16(sp)
 ; RV32I-NEXT:    sw a4, 20(sp)
 ; RV32I-NEXT:    sw a5, 24(sp)
 ; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    andi a0, a1, 12
-; RV32I-NEXT:    addi a3, sp, 16
-; RV32I-NEXT:    sub a3, a3, a0
-; RV32I-NEXT:    lw a0, 0(a3)
-; RV32I-NEXT:    lw a4, 4(a3)
-; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    lw a5, 8(a3)
-; RV32I-NEXT:    lw a3, 12(a3)
-; RV32I-NEXT:    sll a6, a4, a1
-; RV32I-NEXT:    andi a7, a1, 24
-; RV32I-NEXT:    xori a7, a7, 31
-; RV32I-NEXT:    srli t0, a0, 1
-; RV32I-NEXT:    srl t0, t0, a7
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    sll a3, a3, a1
-; RV32I-NEXT:    srli t1, a5, 1
-; RV32I-NEXT:    srl t1, t1, a7
-; RV32I-NEXT:    or t1, a3, t1
-; RV32I-NEXT:    sll a5, a5, a1
-; RV32I-NEXT:    srli a4, a4, 1
-; RV32I-NEXT:    srl a4, a4, a7
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    sll a0, a0, a1
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    srli a1, a4, 16
-; RV32I-NEXT:    srli a7, a4, 8
-; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
-; RV32I-NEXT:    sb a1, 10(a2)
-; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a3, a3, 24
-; RV32I-NEXT:    srli a1, t1, 16
-; RV32I-NEXT:    srli a4, t1, 8
-; RV32I-NEXT:    sb t1, 12(a2)
-; RV32I-NEXT:    sb a4, 13(a2)
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    slli a0, a1, 3
+; RV32I-NEXT:    andi a1, a1, 12
+; RV32I-NEXT:    sub a1, t2, a1
+; RV32I-NEXT:    andi a3, a0, 24
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw a6, 8(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    sll a7, a5, a0
+; RV32I-NEXT:    srli t0, a4, 1
+; RV32I-NEXT:    sll a1, a1, a0
+; RV32I-NEXT:    srli t1, a6, 1
+; RV32I-NEXT:    sll a6, a6, a0
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    sll a0, a4, a0
+; RV32I-NEXT:    srl a4, t0, a3
+; RV32I-NEXT:    srl t0, t1, a3
+; RV32I-NEXT:    srl a3, a5, a3
+; RV32I-NEXT:    srli a5, a6, 24
+; RV32I-NEXT:    srli t1, a1, 24
+; RV32I-NEXT:    srli t2, a0, 16
+; RV32I-NEXT:    srli t3, a0, 24
+; RV32I-NEXT:    srli t4, a0, 8
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    srli a7, a7, 24
+; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a3, a6, a3
 ; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a0, a6, 24
-; RV32I-NEXT:    srli a1, t0, 16
-; RV32I-NEXT:    srli a3, t0, 8
-; RV32I-NEXT:    sb t0, 4(a2)
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    sb a0, 7(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb t2, 2(a2)
+; RV32I-NEXT:    sb t3, 3(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli t0, a1, 16
+; RV32I-NEXT:    srli t2, a1, 8
+; RV32I-NEXT:    srli t3, a4, 16
+; RV32I-NEXT:    srli t4, a4, 8
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a6, 9(a2)
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    sb t1, 15(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    sb t4, 5(a2)
+; RV32I-NEXT:    sb t3, 6(a2)
+; RV32I-NEXT:    sb a7, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1300,208 +1300,208 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 5(a1)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a1)
-; RV64I-NEXT:    lbu a7, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 0(a1)
-; RV64I-NEXT:    lbu a7, 1(a1)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 2(a1)
-; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 4(a1)
+; RV64I-NEXT:    lbu a7, 5(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu t3, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a7, 0(a1)
+; RV64I-NEXT:    lbu t2, 1(a1)
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    lbu t3, 2(a1)
+; RV64I-NEXT:    lbu a1, 3(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    or a5, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 5
-; RV64I-NEXT:    slli a4, a4, 37
-; RV64I-NEXT:    or a5, a4, a1
-; RV64I-NEXT:    addi a4, a5, -64
-; RV64I-NEXT:    sll a1, a3, a5
-; RV64I-NEXT:    bltz a4, .LBB9_2
+; RV64I-NEXT:    slli a6, a5, 37
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a4, a6, a1
+; RV64I-NEXT:    addi a3, a4, -64
+; RV64I-NEXT:    sll a1, a5, a4
+; RV64I-NEXT:    bltz a3, .LBB9_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB9_3
 ; RV64I-NEXT:  .LBB9_2:
 ; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
-; RV64I-NEXT:    lbu t0, 10(a0)
-; RV64I-NEXT:    lbu t1, 11(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
+; RV64I-NEXT:    lbu t1, 8(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    lbu t1, 13(a0)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    lbu t1, 12(a0)
+; RV64I-NEXT:    lbu t2, 13(a0)
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    not a7, a4
+; RV64I-NEXT:    srli a5, a5, 1
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    sll a0, a0, a5
-; RV64I-NEXT:    not a5, a5
-; RV64I-NEXT:    srli a3, a3, 1
-; RV64I-NEXT:    srl a3, a3, a5
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    sll a0, a0, a4
+; RV64I-NEXT:    srl a4, a5, a7
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:  .LBB9_3:
-; RV64I-NEXT:    srai a4, a4, 63
-; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    srai a3, a3, 63
+; RV64I-NEXT:    srli a4, a0, 56
+; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a6, a0, 40
+; RV64I-NEXT:    srli a7, a0, 32
+; RV64I-NEXT:    srli t0, a0, 24
+; RV64I-NEXT:    srli t1, a0, 16
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    sb a7, 12(a2)
+; RV64I-NEXT:    sb a6, 13(a2)
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    sb a4, 15(a2)
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    srli a4, a1, 48
 ; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    srli a6, a1, 32
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t2, a1, 16
 ; RV64I-NEXT:    sb a6, 4(a2)
 ; RV64I-NEXT:    sb a5, 5(a2)
 ; RV64I-NEXT:    sb a4, 6(a2)
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    srli a3, a1, 8
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
-; RV64I-NEXT:    sb a4, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb a3, 1(a2)
+; RV64I-NEXT:    sb t2, 2(a2)
+; RV64I-NEXT:    sb a7, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb t1, 10(a2)
+; RV64I-NEXT:    sb t0, 11(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_16bytes_wordOff:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
+; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu a7, 13(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    or a0, a0, a7
 ; RV32I-NEXT:    lbu a1, 0(a1)
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw a3, 16(sp)
-; RV32I-NEXT:    sw a4, 20(sp)
-; RV32I-NEXT:    sw a5, 24(sp)
-; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    addi t2, sp, 16
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or t4, t6, t5
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a0, a0, t0
 ; RV32I-NEXT:    andi a1, a1, 12
-; RV32I-NEXT:    addi a0, sp, 16
-; RV32I-NEXT:    sub a0, a0, a1
-; RV32I-NEXT:    lw a1, 8(a0)
-; RV32I-NEXT:    lw a3, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    srli a5, a1, 16
-; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    srli a7, a1, 8
-; RV32I-NEXT:    sb a1, 8(a2)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, t1, a5
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    sub a1, t2, a1
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    lw a0, 8(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    srli a7, a0, 8
+; RV32I-NEXT:    srli t0, a1, 16
+; RV32I-NEXT:    srli t1, a1, 24
+; RV32I-NEXT:    srli t2, a1, 8
+; RV32I-NEXT:    srli t3, a4, 16
+; RV32I-NEXT:    srli t4, a4, 24
+; RV32I-NEXT:    srli t5, a4, 8
+; RV32I-NEXT:    srli t6, a3, 16
+; RV32I-NEXT:    sb a0, 8(a2)
 ; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb a5, 10(a2)
 ; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a5, a0, 24
-; RV32I-NEXT:    srli a6, a0, 8
-; RV32I-NEXT:    sb a0, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    sb a5, 15(a2)
-; RV32I-NEXT:    srli a0, a4, 16
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    srli a5, a4, 8
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    sb t1, 15(a2)
+; RV32I-NEXT:    srli a1, a3, 8
 ; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    srli a4, a3, 8
+; RV32I-NEXT:    sb t5, 1(a2)
+; RV32I-NEXT:    sb t3, 2(a2)
+; RV32I-NEXT:    sb t4, 3(a2)
 ; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb t6, 6(a2)
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1516,233 +1516,233 @@ define void @shl_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
 ; RV64I-NEXT:    lbu a6, 4(a1)
 ; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a5, 6(a1)
-; RV64I-NEXT:    lbu t0, 7(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu t3, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
 ; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu t2, 1(a1)
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a5, t1, a5
+; RV64I-NEXT:    or a4, t0, a6
 ; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a6, a5, 32
 ; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    slli a5, a5, 35
-; RV64I-NEXT:    or a5, a5, a1
-; RV64I-NEXT:    addi a6, a5, -64
-; RV64I-NEXT:    sra a1, a3, a5
+; RV64I-NEXT:    slli a7, a4, 35
+; RV64I-NEXT:    or a4, a6, a3
+; RV64I-NEXT:    or a3, a7, a1
+; RV64I-NEXT:    addi a6, a3, -64
+; RV64I-NEXT:    sra a1, a4, a3
 ; RV64I-NEXT:    bltz a6, .LBB10_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    sraiw a3, a4, 31
+; RV64I-NEXT:    sraiw a3, a5, 31
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    j .LBB10_3
 ; RV64I-NEXT:  .LBB10_2:
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
-; RV64I-NEXT:    lbu a7, 2(a0)
-; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 5(a0)
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu a5, 1(a0)
+; RV64I-NEXT:    lbu a6, 2(a0)
+; RV64I-NEXT:    lbu a7, 3(a0)
+; RV64I-NEXT:    lbu t0, 0(a0)
+; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a5, t0
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu t1, 5(a0)
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    not a6, a3
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    srl a0, a0, a5
-; RV64I-NEXT:    not a4, a5
-; RV64I-NEXT:    slli a3, a3, 1
-; RV64I-NEXT:    sll a3, a3, a4
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    srl a0, a0, a3
+; RV64I-NEXT:    sll a3, a4, a6
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:  .LBB10_3:
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    srli a4, a1, 48
 ; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    srli a6, a1, 32
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t0, a1, 16
+; RV64I-NEXT:    srli t1, a1, 8
+; RV64I-NEXT:    srli t2, a0, 56
+; RV64I-NEXT:    srli t3, a0, 48
+; RV64I-NEXT:    srli t4, a0, 40
+; RV64I-NEXT:    srli t5, a0, 32
 ; RV64I-NEXT:    sb a6, 12(a2)
 ; RV64I-NEXT:    sb a5, 13(a2)
 ; RV64I-NEXT:    sb a4, 14(a2)
 ; RV64I-NEXT:    sb a3, 15(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    srli a3, a0, 24
 ; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    sb a5, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    sb a4, 5(a2)
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    sb t1, 9(a2)
+; RV64I-NEXT:    sb t0, 10(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb t5, 4(a2)
+; RV64I-NEXT:    sb t4, 5(a2)
+; RV64I-NEXT:    sb t3, 6(a2)
+; RV64I-NEXT:    sb t2, 7(a2)
 ; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    sb a3, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    lbu t0, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a6, a0, a6
-; RV32I-NEXT:    lbu t0, 0(a1)
-; RV32I-NEXT:    lbu t1, 1(a1)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    or t3, t5, t4
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 1(a1)
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    mv t1, sp
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t2, a0, t2
 ; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    or a5, a7, a6
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    or a6, t2, t0
+; RV32I-NEXT:    or a1, a1, t4
 ; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw a0, 28(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a5, 4(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    andi a0, a1, 12
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    lw a3, 0(a0)
-; RV32I-NEXT:    lw a4, 4(a0)
-; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    lw a5, 8(a0)
-; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    srl a6, a4, a1
-; RV32I-NEXT:    andi a7, a1, 24
-; RV32I-NEXT:    xori a7, a7, 31
-; RV32I-NEXT:    slli t0, a5, 1
-; RV32I-NEXT:    sll t0, t0, a7
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    srl a3, a3, a1
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    sll a4, a4, a7
-; RV32I-NEXT:    or a4, a3, a4
-; RV32I-NEXT:    srl a5, a5, a1
-; RV32I-NEXT:    slli t1, a0, 1
-; RV32I-NEXT:    sll a7, t1, a7
-; RV32I-NEXT:    or a7, a5, a7
-; RV32I-NEXT:    sra a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
+; RV32I-NEXT:    slli a0, a1, 3
+; RV32I-NEXT:    andi a1, a1, 12
+; RV32I-NEXT:    add a1, t1, a1
+; RV32I-NEXT:    andi a3, a0, 24
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw a6, 8(a1)
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    srl a7, a5, a0
+; RV32I-NEXT:    slli t0, a6, 1
+; RV32I-NEXT:    srl a4, a4, a0
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl a6, a6, a0
+; RV32I-NEXT:    slli t1, a1, 1
+; RV32I-NEXT:    sra a0, a1, a0
+; RV32I-NEXT:    sll a1, t0, a3
+; RV32I-NEXT:    sll a5, a5, a3
+; RV32I-NEXT:    sll a3, t1, a3
+; RV32I-NEXT:    srli t0, a0, 16
 ; RV32I-NEXT:    srli t1, a0, 24
 ; RV32I-NEXT:    srli t2, a0, 8
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a5, a4, a5
+; RV32I-NEXT:    or a3, a6, a3
 ; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
-; RV32I-NEXT:    sb a1, 14(a2)
+; RV32I-NEXT:    sb t0, 14(a2)
 ; RV32I-NEXT:    sb t1, 15(a2)
-; RV32I-NEXT:    srli a0, a7, 16
-; RV32I-NEXT:    srli a1, a7, 24
-; RV32I-NEXT:    srli a7, a7, 8
-; RV32I-NEXT:    sb a5, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
+; RV32I-NEXT:    srli a0, a3, 16
+; RV32I-NEXT:    srli t0, a3, 24
+; RV32I-NEXT:    srli a3, a3, 8
+; RV32I-NEXT:    srli t1, a5, 16
+; RV32I-NEXT:    srli t2, a5, 24
+; RV32I-NEXT:    srli a5, a5, 8
+; RV32I-NEXT:    srli t3, a1, 16
+; RV32I-NEXT:    srli t4, a1, 24
+; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a6, 8(a2)
+; RV32I-NEXT:    sb a3, 9(a2)
 ; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    sb a1, 11(a2)
-; RV32I-NEXT:    srli a0, a4, 16
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a3, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, t0, 16
-; RV32I-NEXT:    srli a1, t0, 24
-; RV32I-NEXT:    srli a3, t0, 8
-; RV32I-NEXT:    sb a6, 4(a2)
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    sb t0, 11(a2)
+; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb a5, 1(a2)
+; RV32I-NEXT:    sb t1, 2(a2)
+; RV32I-NEXT:    sb t2, 3(a2)
+; RV32I-NEXT:    sb a7, 4(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb t3, 6(a2)
+; RV32I-NEXT:    sb t4, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1756,209 +1756,209 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
 ; RV64I-NEXT:    lbu a6, 4(a1)
 ; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a5, 6(a1)
-; RV64I-NEXT:    lbu t0, 7(a1)
+; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    lbu t3, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
 ; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 2(a1)
+; RV64I-NEXT:    lbu t2, 1(a1)
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    lbu t3, 2(a1)
 ; RV64I-NEXT:    lbu a1, 3(a1)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a5, t1, a5
+; RV64I-NEXT:    or a4, t0, a6
 ; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a6, a5, 32
 ; RV64I-NEXT:    slli a1, a1, 5
-; RV64I-NEXT:    slli a5, a5, 37
-; RV64I-NEXT:    or a5, a5, a1
-; RV64I-NEXT:    addi a6, a5, -64
-; RV64I-NEXT:    sra a1, a3, a5
+; RV64I-NEXT:    slli a7, a4, 37
+; RV64I-NEXT:    or a4, a6, a3
+; RV64I-NEXT:    or a3, a7, a1
+; RV64I-NEXT:    addi a6, a3, -64
+; RV64I-NEXT:    sra a1, a4, a3
 ; RV64I-NEXT:    bltz a6, .LBB11_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    sraiw a3, a4, 31
+; RV64I-NEXT:    sraiw a3, a5, 31
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    j .LBB11_3
 ; RV64I-NEXT:  .LBB11_2:
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
-; RV64I-NEXT:    lbu a7, 2(a0)
-; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 5(a0)
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu a5, 1(a0)
+; RV64I-NEXT:    lbu a6, 2(a0)
+; RV64I-NEXT:    lbu a7, 3(a0)
+; RV64I-NEXT:    lbu t0, 0(a0)
+; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a5, t0
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu t1, 5(a0)
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    not a6, a3
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    srl a0, a0, a5
-; RV64I-NEXT:    not a4, a5
-; RV64I-NEXT:    slli a3, a3, 1
-; RV64I-NEXT:    sll a3, a3, a4
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    srl a0, a0, a3
+; RV64I-NEXT:    sll a3, a4, a6
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:  .LBB11_3:
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    srli a4, a1, 48
 ; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    srli a6, a1, 32
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t0, a1, 16
+; RV64I-NEXT:    srli t1, a1, 8
+; RV64I-NEXT:    srli t2, a0, 56
+; RV64I-NEXT:    srli t3, a0, 48
+; RV64I-NEXT:    srli t4, a0, 40
+; RV64I-NEXT:    srli t5, a0, 32
 ; RV64I-NEXT:    sb a6, 12(a2)
 ; RV64I-NEXT:    sb a5, 13(a2)
 ; RV64I-NEXT:    sb a4, 14(a2)
 ; RV64I-NEXT:    sb a3, 15(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    srli a3, a0, 24
 ; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    sb a5, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    sb a4, 5(a2)
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    sb t1, 9(a2)
+; RV64I-NEXT:    sb t0, 10(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb t5, 4(a2)
+; RV64I-NEXT:    sb t4, 5(a2)
+; RV64I-NEXT:    sb t3, 6(a2)
+; RV64I-NEXT:    sb t2, 7(a2)
 ; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    sb a3, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_16bytes_wordOff:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
+; RV32I-NEXT:    slli t0, t0, 8
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu a7, 13(a0)
+; RV32I-NEXT:    lbu t0, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a6, a0, a6
-; RV32I-NEXT:    or a6, a6, a7
 ; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    mv t2, sp
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    slli t0, t0, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or t4, t6, t5
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a7, a0, t0
 ; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    andi a1, a1, 12
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, t1, a5
+; RV32I-NEXT:    or a5, t4, t3
+; RV32I-NEXT:    or a6, a7, a6
 ; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    add a1, t2, a1
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    andi a1, a1, 12
-; RV32I-NEXT:    mv a0, sp
-; RV32I-NEXT:    add a0, a0, a1
-; RV32I-NEXT:    lw a1, 8(a0)
-; RV32I-NEXT:    lw a3, 4(a0)
-; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    srli a5, a1, 16
-; RV32I-NEXT:    srli a6, a1, 24
-; RV32I-NEXT:    srli a7, a1, 8
-; RV32I-NEXT:    sb a1, 8(a2)
+; RV32I-NEXT:    lw a0, 8(a1)
+; RV32I-NEXT:    lw a3, 4(a1)
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a1, 12(a1)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    srli a7, a0, 8
+; RV32I-NEXT:    srli t0, a1, 16
+; RV32I-NEXT:    srli t1, a1, 24
+; RV32I-NEXT:    srli t2, a1, 8
+; RV32I-NEXT:    srli t3, a4, 16
+; RV32I-NEXT:    srli t4, a4, 24
+; RV32I-NEXT:    srli t5, a4, 8
+; RV32I-NEXT:    srli t6, a3, 16
+; RV32I-NEXT:    sb a0, 8(a2)
 ; RV32I-NEXT:    sb a7, 9(a2)
 ; RV32I-NEXT:    sb a5, 10(a2)
 ; RV32I-NEXT:    sb a6, 11(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a5, a0, 24
-; RV32I-NEXT:    srli a6, a0, 8
-; RV32I-NEXT:    sb a0, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    sb a5, 15(a2)
-; RV32I-NEXT:    srli a0, a4, 16
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    srli a5, a4, 8
+; RV32I-NEXT:    srli a0, a3, 24
+; RV32I-NEXT:    sb a1, 12(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    sb t1, 15(a2)
+; RV32I-NEXT:    srli a1, a3, 8
 ; RV32I-NEXT:    sb a4, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    srli a4, a3, 8
+; RV32I-NEXT:    sb t5, 1(a2)
+; RV32I-NEXT:    sb t3, 2(a2)
+; RV32I-NEXT:    sb t4, 3(a2)
 ; RV32I-NEXT:    sb a3, 4(a2)
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb t6, 6(a2)
+; RV32I-NEXT:    sb a0, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1972,428 +1972,472 @@ define void @ashr_16bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 5(a1)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t3, s5, s4
+; RV64I-NEXT:    or t4, s7, s6
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    lbu s6, 5(a1)
+; RV64I-NEXT:    or s4, s7, s4
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli s6, s6, 8
+; RV64I-NEXT:    or s3, s6, s3
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    mv s6, sp
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t0, s0, t6
+; RV64I-NEXT:    or t1, s5, s1
+; RV64I-NEXT:    or t2, s4, s2
+; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a0, 24(sp)
-; RV64I-NEXT:    andi a0, a1, 24
-; RV64I-NEXT:    mv a3, sp
-; RV64I-NEXT:    add a0, a3, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    ld a4, 8(a0)
-; RV64I-NEXT:    slli a5, a1, 3
-; RV64I-NEXT:    ld a6, 16(a0)
-; RV64I-NEXT:    ld a7, 24(a0)
-; RV64I-NEXT:    srl a0, a4, a5
-; RV64I-NEXT:    andi a1, a5, 56
-; RV64I-NEXT:    xori t0, a1, 63
-; RV64I-NEXT:    slli a1, a6, 1
-; RV64I-NEXT:    sll a1, a1, t0
-; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    srl a3, a3, a5
-; RV64I-NEXT:    slli a4, a4, 1
-; RV64I-NEXT:    sll a4, a4, t0
-; RV64I-NEXT:    or a4, a3, a4
-; RV64I-NEXT:    srl a6, a6, a5
-; RV64I-NEXT:    slli t1, a7, 1
-; RV64I-NEXT:    sll t0, t1, t0
-; RV64I-NEXT:    or t0, a6, t0
-; RV64I-NEXT:    srl a5, a7, a5
-; RV64I-NEXT:    srli a7, a5, 56
-; RV64I-NEXT:    srli t1, a5, 48
-; RV64I-NEXT:    srli t2, a5, 40
-; RV64I-NEXT:    srli t3, a5, 32
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    slli a4, a1, 3
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    add a1, s6, a1
+; RV64I-NEXT:    andi a0, a4, 56
+; RV64I-NEXT:    ld a3, 0(a1)
+; RV64I-NEXT:    ld a5, 8(a1)
+; RV64I-NEXT:    ld a6, 16(a1)
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 24(a1)
+; RV64I-NEXT:    srl a0, a5, a4
+; RV64I-NEXT:    slli t1, a6, 1
+; RV64I-NEXT:    srl a1, a3, a4
+; RV64I-NEXT:    slli a5, a5, 1
+; RV64I-NEXT:    srl a3, a6, a4
+; RV64I-NEXT:    slli a6, t0, 1
+; RV64I-NEXT:    srl t0, t0, a4
+; RV64I-NEXT:    sll a4, t1, a7
+; RV64I-NEXT:    sll a5, a5, a7
+; RV64I-NEXT:    sll a6, a6, a7
+; RV64I-NEXT:    srli a7, t0, 56
+; RV64I-NEXT:    srli t1, t0, 48
+; RV64I-NEXT:    srli t2, t0, 40
+; RV64I-NEXT:    srli t3, t0, 32
+; RV64I-NEXT:    srli t4, t0, 24
+; RV64I-NEXT:    srli t5, t0, 16
+; RV64I-NEXT:    srli t6, t0, 8
+; RV64I-NEXT:    or a4, a0, a4
+; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    or a6, a3, a6
 ; RV64I-NEXT:    sb t3, 28(a2)
 ; RV64I-NEXT:    sb t2, 29(a2)
 ; RV64I-NEXT:    sb t1, 30(a2)
 ; RV64I-NEXT:    sb a7, 31(a2)
-; RV64I-NEXT:    srli a7, a5, 24
-; RV64I-NEXT:    srli t1, a5, 16
-; RV64I-NEXT:    srli t2, a5, 8
-; RV64I-NEXT:    sb a5, 24(a2)
-; RV64I-NEXT:    sb t2, 25(a2)
-; RV64I-NEXT:    sb t1, 26(a2)
-; RV64I-NEXT:    sb a7, 27(a2)
-; RV64I-NEXT:    srli a5, t0, 56
-; RV64I-NEXT:    srli a7, t0, 48
-; RV64I-NEXT:    srli t1, t0, 40
-; RV64I-NEXT:    srli t2, t0, 32
+; RV64I-NEXT:    sb t0, 24(a2)
+; RV64I-NEXT:    sb t6, 25(a2)
+; RV64I-NEXT:    sb t5, 26(a2)
+; RV64I-NEXT:    sb t4, 27(a2)
+; RV64I-NEXT:    srli a7, a6, 56
+; RV64I-NEXT:    srli t0, a6, 48
+; RV64I-NEXT:    srli t1, a6, 40
+; RV64I-NEXT:    srli t2, a6, 32
+; RV64I-NEXT:    srli t3, a6, 24
+; RV64I-NEXT:    srli t4, a6, 16
+; RV64I-NEXT:    srli a6, a6, 8
+; RV64I-NEXT:    srli t5, a5, 56
+; RV64I-NEXT:    srli t6, a5, 48
+; RV64I-NEXT:    srli s0, a5, 40
+; RV64I-NEXT:    srli s1, a5, 32
+; RV64I-NEXT:    srli s2, a5, 24
+; RV64I-NEXT:    srli s3, a5, 16
+; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    srli s4, a4, 56
+; RV64I-NEXT:    srli s5, a4, 48
+; RV64I-NEXT:    srli s6, a4, 40
 ; RV64I-NEXT:    sb t2, 20(a2)
 ; RV64I-NEXT:    sb t1, 21(a2)
-; RV64I-NEXT:    sb a7, 22(a2)
-; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, t0, 24
-; RV64I-NEXT:    srli a7, t0, 16
-; RV64I-NEXT:    srli t0, t0, 8
-; RV64I-NEXT:    sb a6, 16(a2)
-; RV64I-NEXT:    sb t0, 17(a2)
-; RV64I-NEXT:    sb a7, 18(a2)
-; RV64I-NEXT:    sb a5, 19(a2)
-; RV64I-NEXT:    srli a5, a4, 56
-; RV64I-NEXT:    srli a6, a4, 48
-; RV64I-NEXT:    srli a7, a4, 40
-; RV64I-NEXT:    srli t0, a4, 32
-; RV64I-NEXT:    sb t0, 4(a2)
-; RV64I-NEXT:    sb a7, 5(a2)
-; RV64I-NEXT:    sb a6, 6(a2)
-; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    srli a5, a4, 24
+; RV64I-NEXT:    sb t0, 22(a2)
+; RV64I-NEXT:    sb a7, 23(a2)
+; RV64I-NEXT:    srli a7, a4, 32
+; RV64I-NEXT:    sb a3, 16(a2)
+; RV64I-NEXT:    sb a6, 17(a2)
+; RV64I-NEXT:    sb t4, 18(a2)
+; RV64I-NEXT:    sb t3, 19(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb s1, 4(a2)
+; RV64I-NEXT:    sb s0, 5(a2)
+; RV64I-NEXT:    sb t6, 6(a2)
+; RV64I-NEXT:    sb t5, 7(a2)
 ; RV64I-NEXT:    srli a6, a4, 16
 ; RV64I-NEXT:    srli a4, a4, 8
-; RV64I-NEXT:    sb a3, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a6, 2(a2)
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a6, a1, 32
-; RV64I-NEXT:    sb a6, 12(a2)
-; RV64I-NEXT:    sb a5, 13(a2)
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    sb a3, 15(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    sb a5, 1(a2)
+; RV64I-NEXT:    sb s3, 2(a2)
+; RV64I-NEXT:    sb s2, 3(a2)
+; RV64I-NEXT:    sb a7, 12(a2)
+; RV64I-NEXT:    sb s6, 13(a2)
+; RV64I-NEXT:    sb s5, 14(a2)
+; RV64I-NEXT:    sb s4, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
+; RV64I-NEXT:    sb a4, 9(a2)
+; RV64I-NEXT:    sb a6, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -80
-; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t3, 5(a0)
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu s0, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s6, 10(a0)
+; RV32I-NEXT:    lbu s8, 11(a0)
+; RV32I-NEXT:    lbu s9, 12(a0)
+; RV32I-NEXT:    lbu s10, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s7, 15(a0)
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu ra, 18(a0)
+; RV32I-NEXT:    lbu a3, 19(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 21(a0)
+; RV32I-NEXT:    lbu a7, 22(a0)
+; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    lbu t0, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t3, t0
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
 ; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t1, t4, t1
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    lbu t2, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    lbu t2, 0(a1)
-; RV32I-NEXT:    lbu t4, 1(a1)
-; RV32I-NEXT:    or a0, a0, t3
-; RV32I-NEXT:    lbu t3, 2(a1)
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    or a4, a4, s1
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t3, t1
+; RV32I-NEXT:    or a6, s0, t4
+; RV32I-NEXT:    lbu t1, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or t2, s3, t2
+; RV32I-NEXT:    or t3, s8, s6
+; RV32I-NEXT:    or t4, s10, s9
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s8, 30(a0)
+; RV32I-NEXT:    lbu s9, 31(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s7, s4
+; RV32I-NEXT:    or s4, s11, s5
+; RV32I-NEXT:    or s5, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s10, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t2, t4, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t3
-; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 68(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    andi a0, a1, 28
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a6, a3, a0
-; RV32I-NEXT:    lw a3, 0(a6)
-; RV32I-NEXT:    lw a4, 4(a6)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t5, t6, t5
+; RV32I-NEXT:    addi t6, sp, 8
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s0, t1
+; RV32I-NEXT:    or t1, s2, s1
+; RV32I-NEXT:    or s0, s6, s3
+; RV32I-NEXT:    or s1, s9, s8
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s10
+; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s2
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t3, t2
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or a7, a7, t5
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw a7, 28(sp)
+; RV32I-NEXT:    sw t0, 32(sp)
+; RV32I-NEXT:    sw s0, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    slli t1, a1, 3
-; RV32I-NEXT:    lw a7, 8(a6)
-; RV32I-NEXT:    lw t0, 12(a6)
+; RV32I-NEXT:    andi a1, a1, 28
+; RV32I-NEXT:    add a1, t6, a1
+; RV32I-NEXT:    andi a0, t1, 24
+; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    lw a6, 12(a1)
+; RV32I-NEXT:    lw a7, 16(a1)
+; RV32I-NEXT:    lw t2, 20(a1)
+; RV32I-NEXT:    lw t3, 24(a1)
+; RV32I-NEXT:    lw t4, 28(a1)
 ; RV32I-NEXT:    srl a0, a4, t1
-; RV32I-NEXT:    andi a1, t1, 24
-; RV32I-NEXT:    xori t2, a1, 31
-; RV32I-NEXT:    slli a1, a7, 1
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    srl a3, a3, t1
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    sll a4, a4, t2
-; RV32I-NEXT:    or a4, a3, a4
-; RV32I-NEXT:    srl a5, t0, t1
-; RV32I-NEXT:    lw t3, 16(a6)
-; RV32I-NEXT:    lw t4, 20(a6)
-; RV32I-NEXT:    lw t5, 24(a6)
-; RV32I-NEXT:    lw t6, 28(a6)
-; RV32I-NEXT:    slli a6, t3, 1
-; RV32I-NEXT:    sll a6, a6, t2
-; RV32I-NEXT:    or a6, a5, a6
-; RV32I-NEXT:    srl a7, a7, t1
-; RV32I-NEXT:    slli t0, t0, 1
-; RV32I-NEXT:    sll t0, t0, t2
-; RV32I-NEXT:    or t0, a7, t0
-; RV32I-NEXT:    srl s0, t4, t1
-; RV32I-NEXT:    slli s1, t5, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s1, s0, s1
-; RV32I-NEXT:    srl t3, t3, t1
-; RV32I-NEXT:    slli t4, t4, 1
-; RV32I-NEXT:    sll t4, t4, t2
-; RV32I-NEXT:    or t4, t3, t4
-; RV32I-NEXT:    srl t5, t5, t1
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t2, s2, t2
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    srl t1, t6, t1
-; RV32I-NEXT:    srli t6, t1, 24
-; RV32I-NEXT:    srli s2, t1, 16
-; RV32I-NEXT:    srli s3, t1, 8
+; RV32I-NEXT:    slli t5, a5, 1
+; RV32I-NEXT:    srl a1, a3, t1
+; RV32I-NEXT:    slli t6, a4, 1
+; RV32I-NEXT:    srl a3, a6, t1
+; RV32I-NEXT:    slli s0, a7, 1
+; RV32I-NEXT:    srl a4, a5, t1
+; RV32I-NEXT:    slli s1, a6, 1
+; RV32I-NEXT:    srl a5, t2, t1
+; RV32I-NEXT:    slli s2, t3, 1
+; RV32I-NEXT:    srl a6, a7, t1
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    srl a7, t3, t1
+; RV32I-NEXT:    slli t3, t4, 1
+; RV32I-NEXT:    srl t1, t4, t1
+; RV32I-NEXT:    sll t4, t5, t0
+; RV32I-NEXT:    sll t5, t6, t0
+; RV32I-NEXT:    sll t6, s0, t0
+; RV32I-NEXT:    sll s0, s1, t0
+; RV32I-NEXT:    sll s1, s2, t0
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    sll t3, t3, t0
+; RV32I-NEXT:    srli s2, t1, 24
+; RV32I-NEXT:    srli s3, t1, 16
+; RV32I-NEXT:    srli s4, t1, 8
+; RV32I-NEXT:    or t0, a0, t4
+; RV32I-NEXT:    or t4, a1, t5
+; RV32I-NEXT:    or t5, a3, t6
+; RV32I-NEXT:    or s0, a4, s0
+; RV32I-NEXT:    or s1, a5, s1
+; RV32I-NEXT:    or t2, a6, t2
+; RV32I-NEXT:    or t3, a7, t3
 ; RV32I-NEXT:    sb t1, 28(a2)
-; RV32I-NEXT:    sb s3, 29(a2)
-; RV32I-NEXT:    sb s2, 30(a2)
-; RV32I-NEXT:    sb t6, 31(a2)
-; RV32I-NEXT:    srli t1, t2, 24
-; RV32I-NEXT:    srli t6, t2, 16
+; RV32I-NEXT:    sb s4, 29(a2)
+; RV32I-NEXT:    sb s3, 30(a2)
+; RV32I-NEXT:    sb s2, 31(a2)
+; RV32I-NEXT:    srli t1, t3, 24
+; RV32I-NEXT:    srli t6, t3, 16
+; RV32I-NEXT:    srli t3, t3, 8
+; RV32I-NEXT:    srli s2, t2, 24
+; RV32I-NEXT:    srli s3, t2, 16
 ; RV32I-NEXT:    srli t2, t2, 8
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli s4, s1, 24
+; RV32I-NEXT:    srli s5, s1, 16
+; RV32I-NEXT:    srli s1, s1, 8
+; RV32I-NEXT:    srli s6, s0, 24
+; RV32I-NEXT:    srli s7, s0, 16
+; RV32I-NEXT:    srli s0, s0, 8
+; RV32I-NEXT:    srli s8, t5, 24
+; RV32I-NEXT:    srli s9, t5, 16
+; RV32I-NEXT:    srli t5, t5, 8
+; RV32I-NEXT:    srli s10, t4, 24
+; RV32I-NEXT:    srli s11, t4, 16
+; RV32I-NEXT:    srli t4, t4, 8
+; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t6, 26(a2)
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, t4, 24
-; RV32I-NEXT:    srli t2, t4, 16
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t3, 16(a2)
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    sb t2, 18(a2)
-; RV32I-NEXT:    sb t1, 19(a2)
-; RV32I-NEXT:    srli t1, s1, 24
-; RV32I-NEXT:    srli t2, s1, 16
-; RV32I-NEXT:    srli s1, s1, 8
-; RV32I-NEXT:    sb s0, 20(a2)
-; RV32I-NEXT:    sb s1, 21(a2)
-; RV32I-NEXT:    sb t2, 22(a2)
-; RV32I-NEXT:    sb t1, 23(a2)
-; RV32I-NEXT:    srli t1, t0, 24
-; RV32I-NEXT:    srli t2, t0, 16
+; RV32I-NEXT:    srli a7, t0, 24
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb t2, 17(a2)
+; RV32I-NEXT:    sb s3, 18(a2)
+; RV32I-NEXT:    sb s2, 19(a2)
+; RV32I-NEXT:    srli a6, t0, 16
 ; RV32I-NEXT:    srli t0, t0, 8
-; RV32I-NEXT:    sb a7, 8(a2)
-; RV32I-NEXT:    sb t0, 9(a2)
-; RV32I-NEXT:    sb t2, 10(a2)
-; RV32I-NEXT:    sb t1, 11(a2)
-; RV32I-NEXT:    srli a7, a6, 24
-; RV32I-NEXT:    srli t0, a6, 16
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a5, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb t0, 14(a2)
-; RV32I-NEXT:    sb a7, 15(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a3, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a6, 2(a2)
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    sb s5, 22(a2)
+; RV32I-NEXT:    sb s4, 23(a2)
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb s0, 9(a2)
+; RV32I-NEXT:    sb s7, 10(a2)
+; RV32I-NEXT:    sb s6, 11(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb t5, 13(a2)
+; RV32I-NEXT:    sb s9, 14(a2)
+; RV32I-NEXT:    sb s8, 15(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb s11, 2(a2)
+; RV32I-NEXT:    sb s10, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    sb a4, 6(a2)
-; RV32I-NEXT:    sb a3, 7(a2)
-; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 80
+; RV32I-NEXT:    sb t0, 5(a2)
+; RV32I-NEXT:    sb a6, 6(a2)
+; RV32I-NEXT:    sb a7, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -2406,381 +2450,431 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 5(a1)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t3, s5, s4
+; RV64I-NEXT:    or t4, s7, s6
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    lbu s6, 5(a1)
+; RV64I-NEXT:    or s4, s7, s4
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli s6, s6, 8
+; RV64I-NEXT:    or s3, s6, s3
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    mv s6, sp
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t0, s0, t6
+; RV64I-NEXT:    or t1, s5, s1
+; RV64I-NEXT:    or t2, s4, s2
+; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a0, 24(sp)
-; RV64I-NEXT:    slli a0, a1, 2
-; RV64I-NEXT:    andi a0, a0, 24
-; RV64I-NEXT:    mv a3, sp
-; RV64I-NEXT:    add a0, a3, a0
-; RV64I-NEXT:    ld a4, 0(a0)
-; RV64I-NEXT:    ld a5, 8(a0)
-; RV64I-NEXT:    slli a6, a1, 5
-; RV64I-NEXT:    ld a7, 16(a0)
-; RV64I-NEXT:    ld t0, 24(a0)
-; RV64I-NEXT:    srl a3, a5, a6
-; RV64I-NEXT:    andi a0, a6, 32
-; RV64I-NEXT:    xori t1, a0, 63
-; RV64I-NEXT:    slli a0, a7, 1
-; RV64I-NEXT:    sll a0, a0, t1
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    srl t2, a4, a6
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    slli a3, a1, 5
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    andi a0, a3, 32
+; RV64I-NEXT:    add a1, s6, a1
+; RV64I-NEXT:    ld a4, 0(a1)
+; RV64I-NEXT:    ld a5, 8(a1)
+; RV64I-NEXT:    ld a6, 16(a1)
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 24(a1)
+; RV64I-NEXT:    srl a0, a5, a3
+; RV64I-NEXT:    slli t1, a6, 1
+; RV64I-NEXT:    srl a1, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 1
-; RV64I-NEXT:    sll a1, a5, t1
-; RV64I-NEXT:    or a1, t2, a1
-; RV64I-NEXT:    srl a5, a7, a6
-; RV64I-NEXT:    slli a4, t0, 1
-; RV64I-NEXT:    sll a4, a4, t1
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    srl a6, t0, a6
-; RV64I-NEXT:    srli a7, a5, 24
-; RV64I-NEXT:    srli t0, a5, 16
-; RV64I-NEXT:    srli t1, a5, 8
-; RV64I-NEXT:    sb a5, 16(a2)
-; RV64I-NEXT:    sb t1, 17(a2)
-; RV64I-NEXT:    sb t0, 18(a2)
+; RV64I-NEXT:    srl a4, a6, a3
+; RV64I-NEXT:    slli a6, t0, 1
+; RV64I-NEXT:    srl a3, t0, a3
+; RV64I-NEXT:    sll t0, t1, a7
+; RV64I-NEXT:    sll a5, a5, a7
+; RV64I-NEXT:    sll a6, a6, a7
+; RV64I-NEXT:    srli a7, a4, 24
+; RV64I-NEXT:    srli t1, a4, 16
+; RV64I-NEXT:    srli t2, a4, 8
+; RV64I-NEXT:    srli t3, a3, 56
+; RV64I-NEXT:    srli t4, a3, 48
+; RV64I-NEXT:    srli t5, a3, 40
+; RV64I-NEXT:    srli t6, a3, 32
+; RV64I-NEXT:    srli s0, a3, 24
+; RV64I-NEXT:    srli s1, a3, 16
+; RV64I-NEXT:    srli s2, a3, 8
+; RV64I-NEXT:    srli s3, a1, 24
+; RV64I-NEXT:    srli s4, a1, 16
+; RV64I-NEXT:    srli s5, a1, 8
+; RV64I-NEXT:    srli s6, a0, 24
+; RV64I-NEXT:    or a6, a4, a6
+; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    sb t2, 17(a2)
+; RV64I-NEXT:    sb t1, 18(a2)
 ; RV64I-NEXT:    sb a7, 19(a2)
-; RV64I-NEXT:    srli a5, a6, 56
-; RV64I-NEXT:    srli a7, a6, 48
-; RV64I-NEXT:    srli t0, a6, 40
-; RV64I-NEXT:    srli t1, a6, 32
-; RV64I-NEXT:    sb t1, 28(a2)
-; RV64I-NEXT:    sb t0, 29(a2)
-; RV64I-NEXT:    sb a7, 30(a2)
-; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a6, 24
-; RV64I-NEXT:    srli a7, a6, 16
-; RV64I-NEXT:    srli t0, a6, 8
-; RV64I-NEXT:    sb a6, 24(a2)
-; RV64I-NEXT:    sb t0, 25(a2)
-; RV64I-NEXT:    sb a7, 26(a2)
-; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, t2, 24
-; RV64I-NEXT:    srli a6, t2, 16
-; RV64I-NEXT:    srli a7, t2, 8
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb a7, 1(a2)
-; RV64I-NEXT:    sb a6, 2(a2)
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a5, a3, 24
-; RV64I-NEXT:    srli a6, a3, 16
-; RV64I-NEXT:    srli a7, a3, 8
-; RV64I-NEXT:    sb a3, 8(a2)
+; RV64I-NEXT:    srli a4, a0, 16
+; RV64I-NEXT:    sb t6, 28(a2)
+; RV64I-NEXT:    sb t5, 29(a2)
+; RV64I-NEXT:    sb t4, 30(a2)
+; RV64I-NEXT:    sb t3, 31(a2)
+; RV64I-NEXT:    srli a7, a0, 8
+; RV64I-NEXT:    or t0, a0, t0
+; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    sb s2, 25(a2)
+; RV64I-NEXT:    sb s1, 26(a2)
+; RV64I-NEXT:    sb s0, 27(a2)
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    sb s5, 1(a2)
+; RV64I-NEXT:    sb s4, 2(a2)
+; RV64I-NEXT:    sb s3, 3(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a7, 9(a2)
-; RV64I-NEXT:    sb a6, 10(a2)
-; RV64I-NEXT:    sb a5, 11(a2)
-; RV64I-NEXT:    srli a3, a4, 56
-; RV64I-NEXT:    srli a5, a4, 48
-; RV64I-NEXT:    srli a6, a4, 40
-; RV64I-NEXT:    srli a4, a4, 32
+; RV64I-NEXT:    sb a4, 10(a2)
+; RV64I-NEXT:    sb s6, 11(a2)
+; RV64I-NEXT:    srli a0, a6, 56
+; RV64I-NEXT:    srli a1, a6, 48
+; RV64I-NEXT:    srli a3, a6, 40
+; RV64I-NEXT:    srli a4, a6, 32
+; RV64I-NEXT:    srli a6, a5, 56
+; RV64I-NEXT:    srli a7, a5, 48
+; RV64I-NEXT:    srli t1, a5, 40
+; RV64I-NEXT:    srli a5, a5, 32
+; RV64I-NEXT:    srli t2, t0, 56
+; RV64I-NEXT:    srli t3, t0, 48
+; RV64I-NEXT:    srli t4, t0, 40
+; RV64I-NEXT:    srli t0, t0, 32
 ; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    sb a6, 21(a2)
-; RV64I-NEXT:    sb a5, 22(a2)
-; RV64I-NEXT:    sb a3, 23(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a1, a1, 32
-; RV64I-NEXT:    sb a1, 4(a2)
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    sb a4, 6(a2)
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a0, a0, 32
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb a3, 21(a2)
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    sb a5, 4(a2)
+; RV64I-NEXT:    sb t1, 5(a2)
+; RV64I-NEXT:    sb a7, 6(a2)
+; RV64I-NEXT:    sb a6, 7(a2)
+; RV64I-NEXT:    sb t0, 12(a2)
+; RV64I-NEXT:    sb t4, 13(a2)
+; RV64I-NEXT:    sb t3, 14(a2)
+; RV64I-NEXT:    sb t2, 15(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 0(a0)
+; RV32I-NEXT:    lbu t0, 1(a0)
+; RV32I-NEXT:    lbu t1, 2(a0)
+; RV32I-NEXT:    lbu s1, 3(a0)
+; RV32I-NEXT:    lbu s7, 4(a0)
+; RV32I-NEXT:    lbu s8, 5(a0)
+; RV32I-NEXT:    lbu s4, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s5, 8(a0)
+; RV32I-NEXT:    lbu s10, 9(a0)
+; RV32I-NEXT:    lbu s11, 10(a0)
+; RV32I-NEXT:    lbu ra, 11(a0)
+; RV32I-NEXT:    lbu t4, 12(a0)
+; RV32I-NEXT:    lbu t6, 13(a0)
+; RV32I-NEXT:    lbu a5, 14(a0)
+; RV32I-NEXT:    lbu a6, 15(a0)
+; RV32I-NEXT:    lbu a3, 16(a0)
+; RV32I-NEXT:    lbu t2, 17(a0)
+; RV32I-NEXT:    lbu t3, 18(a0)
+; RV32I-NEXT:    lbu t5, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu s0, 21(a0)
+; RV32I-NEXT:    lbu s2, 22(a0)
+; RV32I-NEXT:    lbu s3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    lbu t0, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t3, t0
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t1, t4, t1
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s1, t1
+; RV32I-NEXT:    or t1, s8, s7
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s7, 25(a0)
+; RV32I-NEXT:    lbu s8, 26(a0)
+; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    slli s11, s11, 16
+; RV32I-NEXT:    slli ra, ra, 24
+; RV32I-NEXT:    or s4, s6, s4
+; RV32I-NEXT:    or s5, s10, s5
+; RV32I-NEXT:    or s6, ra, s11
+; RV32I-NEXT:    lbu s10, 28(a0)
+; RV32I-NEXT:    lbu s11, 29(a0)
+; RV32I-NEXT:    lbu ra, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    or a0, a0, t3
 ; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 68(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t4, t6, t4
+; RV32I-NEXT:    addi t6, sp, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a3, t2, a3
+; RV32I-NEXT:    or a6, t5, t3
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t3, s7, s1
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or a0, a0, ra
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    mv a0, sp
-; RV32I-NEXT:    add a4, a0, a1
-; RV32I-NEXT:    lw a5, 16(a4)
-; RV32I-NEXT:    lw a6, 20(a4)
-; RV32I-NEXT:    lw a7, 24(a4)
-; RV32I-NEXT:    lw a1, 0(a4)
-; RV32I-NEXT:    lw a0, 4(a4)
-; RV32I-NEXT:    lw t0, 8(a4)
-; RV32I-NEXT:    lw a3, 12(a4)
-; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s4, t1
+; RV32I-NEXT:    or t1, s6, s5
+; RV32I-NEXT:    or a5, a5, t4
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, t2, a4
+; RV32I-NEXT:    or a6, t5, t3
+; RV32I-NEXT:    or a0, a0, s0
+; RV32I-NEXT:    add t6, t6, a1
+; RV32I-NEXT:    sw a3, 24(sp)
+; RV32I-NEXT:    sw a4, 28(sp)
+; RV32I-NEXT:    sw a6, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a7, 8(sp)
+; RV32I-NEXT:    sw t0, 12(sp)
+; RV32I-NEXT:    sw t1, 16(sp)
+; RV32I-NEXT:    sw a5, 20(sp)
+; RV32I-NEXT:    lw a6, 16(t6)
+; RV32I-NEXT:    lw a5, 20(t6)
+; RV32I-NEXT:    lw a7, 24(t6)
+; RV32I-NEXT:    lw a1, 0(t6)
+; RV32I-NEXT:    lw a0, 4(t6)
+; RV32I-NEXT:    lw a4, 8(t6)
+; RV32I-NEXT:    lw a3, 12(t6)
+; RV32I-NEXT:    lw t0, 28(t6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
+; RV32I-NEXT:    srli t4, t0, 24
+; RV32I-NEXT:    srli t5, t0, 16
+; RV32I-NEXT:    srli t6, t0, 8
+; RV32I-NEXT:    srli s0, a6, 24
+; RV32I-NEXT:    srli s1, a6, 16
+; RV32I-NEXT:    srli s2, a6, 8
+; RV32I-NEXT:    srli s3, a5, 24
+; RV32I-NEXT:    srli s4, a5, 16
+; RV32I-NEXT:    srli s5, a5, 8
+; RV32I-NEXT:    srli s6, a4, 24
+; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
+; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
 ; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t2, 26(a2)
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a4, 24
-; RV32I-NEXT:    srli t1, a4, 16
-; RV32I-NEXT:    srli t2, a4, 8
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb t2, 29(a2)
-; RV32I-NEXT:    sb t1, 30(a2)
-; RV32I-NEXT:    sb a7, 31(a2)
-; RV32I-NEXT:    srli a4, a5, 24
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    srli t1, a5, 8
-; RV32I-NEXT:    sb a5, 16(a2)
-; RV32I-NEXT:    sb t1, 17(a2)
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    sb a4, 19(a2)
-; RV32I-NEXT:    srli a4, a6, 24
-; RV32I-NEXT:    srli a5, a6, 16
-; RV32I-NEXT:    srli a7, a6, 8
-; RV32I-NEXT:    sb a6, 20(a2)
-; RV32I-NEXT:    sb a7, 21(a2)
-; RV32I-NEXT:    sb a5, 22(a2)
-; RV32I-NEXT:    sb a4, 23(a2)
-; RV32I-NEXT:    srli a4, t0, 24
-; RV32I-NEXT:    srli a5, t0, 16
-; RV32I-NEXT:    srli a6, t0, 8
-; RV32I-NEXT:    sb t0, 8(a2)
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    sb a5, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    sb t4, 31(a2)
+; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb s2, 17(a2)
+; RV32I-NEXT:    sb s1, 18(a2)
+; RV32I-NEXT:    sb s0, 19(a2)
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb s5, 21(a2)
+; RV32I-NEXT:    sb s4, 22(a2)
+; RV32I-NEXT:    sb s3, 23(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb s7, 10(a2)
+; RV32I-NEXT:    sb s6, 11(a2)
+; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    sb ra, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -2793,344 +2887,394 @@ define void @lshr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes_dwordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
-; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t2, 2(a0)
+; RV64I-NEXT:    lbu s3, 3(a0)
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu s8, 5(a0)
+; RV64I-NEXT:    lbu s9, 6(a0)
+; RV64I-NEXT:    lbu s10, 7(a0)
+; RV64I-NEXT:    lbu s2, 8(a0)
+; RV64I-NEXT:    lbu s4, 9(a0)
+; RV64I-NEXT:    lbu s5, 10(a0)
+; RV64I-NEXT:    lbu s6, 11(a0)
+; RV64I-NEXT:    lbu s7, 12(a0)
+; RV64I-NEXT:    lbu s11, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t3, 15(a0)
+; RV64I-NEXT:    lbu a3, 16(a0)
+; RV64I-NEXT:    lbu a6, 17(a0)
+; RV64I-NEXT:    lbu t4, 18(a0)
+; RV64I-NEXT:    lbu t5, 19(a0)
+; RV64I-NEXT:    lbu a4, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s0, 22(a0)
+; RV64I-NEXT:    lbu s1, 23(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, s3, t2
+; RV64I-NEXT:    or t0, s8, t0
+; RV64I-NEXT:    or t2, s10, s9
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s8, 25(a0)
+; RV64I-NEXT:    lbu s9, 26(a0)
+; RV64I-NEXT:    lbu s10, 27(a0)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    or s2, s4, s2
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    or s5, s11, s7
+; RV64I-NEXT:    lbu s6, 28(a0)
+; RV64I-NEXT:    lbu s7, 29(a0)
+; RV64I-NEXT:    lbu s11, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
-; RV64I-NEXT:    sd a3, 0(sp)
-; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t3, t1
+; RV64I-NEXT:    mv t3, sp
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli t5, t5, 24
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    slli s0, s0, 16
+; RV64I-NEXT:    slli s1, s1, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a6, t5, t4
+; RV64I-NEXT:    or a4, t6, a4
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or t4, s8, s3
+; RV64I-NEXT:    or t5, s10, s9
+; RV64I-NEXT:    or t6, s7, s6
+; RV64I-NEXT:    or a0, a0, s11
 ; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    mv a0, sp
-; RV64I-NEXT:    add a3, a0, a1
-; RV64I-NEXT:    ld a4, 16(a3)
-; RV64I-NEXT:    ld a0, 8(a3)
-; RV64I-NEXT:    ld a1, 0(a3)
-; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    or t0, s4, s2
+; RV64I-NEXT:    or t1, t1, s5
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a4, s0, a4
+; RV64I-NEXT:    or a6, t5, t4
+; RV64I-NEXT:    or a0, a0, t6
+; RV64I-NEXT:    add t3, t3, a1
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a1, a7, a5
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    sd a1, 0(sp)
+; RV64I-NEXT:    sd a5, 8(sp)
+; RV64I-NEXT:    sd a3, 16(sp)
+; RV64I-NEXT:    sd a0, 24(sp)
+; RV64I-NEXT:    ld a4, 16(t3)
+; RV64I-NEXT:    ld a0, 8(t3)
+; RV64I-NEXT:    ld a1, 0(t3)
+; RV64I-NEXT:    ld a3, 24(t3)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
 ; RV64I-NEXT:    srli t0, a4, 32
+; RV64I-NEXT:    srli t1, a4, 24
+; RV64I-NEXT:    srli t2, a4, 16
+; RV64I-NEXT:    srli t3, a4, 8
+; RV64I-NEXT:    srli t4, a3, 56
+; RV64I-NEXT:    srli t5, a3, 48
+; RV64I-NEXT:    srli t6, a3, 40
+; RV64I-NEXT:    srli s0, a3, 32
+; RV64I-NEXT:    srli s1, a3, 24
+; RV64I-NEXT:    srli s2, a3, 16
+; RV64I-NEXT:    srli s3, a3, 8
+; RV64I-NEXT:    srli s4, a1, 56
+; RV64I-NEXT:    srli s5, a1, 48
+; RV64I-NEXT:    srli s6, a1, 40
+; RV64I-NEXT:    srli s7, a1, 32
+; RV64I-NEXT:    srli s8, a1, 24
+; RV64I-NEXT:    srli s9, a1, 16
+; RV64I-NEXT:    srli s10, a1, 8
+; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
 ; RV64I-NEXT:    sb a7, 21(a2)
 ; RV64I-NEXT:    sb a6, 22(a2)
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a4, 24
-; RV64I-NEXT:    srli a6, a4, 16
-; RV64I-NEXT:    srli a7, a4, 8
+; RV64I-NEXT:    srli a5, a0, 48
 ; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    sb a7, 17(a2)
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    sb a5, 19(a2)
-; RV64I-NEXT:    srli a4, a3, 56
-; RV64I-NEXT:    srli a5, a3, 48
-; RV64I-NEXT:    srli a6, a3, 40
-; RV64I-NEXT:    srli a7, a3, 32
-; RV64I-NEXT:    sb a7, 28(a2)
-; RV64I-NEXT:    sb a6, 29(a2)
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    sb a4, 31(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    srli a6, a3, 8
+; RV64I-NEXT:    sb t3, 17(a2)
+; RV64I-NEXT:    sb t2, 18(a2)
+; RV64I-NEXT:    sb t1, 19(a2)
+; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    sb s0, 28(a2)
+; RV64I-NEXT:    sb t6, 29(a2)
+; RV64I-NEXT:    sb t5, 30(a2)
+; RV64I-NEXT:    sb t4, 31(a2)
+; RV64I-NEXT:    srli a6, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
-; RV64I-NEXT:    sb a6, 25(a2)
-; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    sb a4, 27(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a6, a1, 32
-; RV64I-NEXT:    sb a6, 4(a2)
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    sb a4, 6(a2)
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    sb s3, 25(a2)
+; RV64I-NEXT:    sb s2, 26(a2)
+; RV64I-NEXT:    sb s1, 27(a2)
+; RV64I-NEXT:    srli a3, a0, 24
+; RV64I-NEXT:    sb s7, 4(a2)
+; RV64I-NEXT:    sb s6, 5(a2)
+; RV64I-NEXT:    sb s5, 6(a2)
+; RV64I-NEXT:    sb s4, 7(a2)
+; RV64I-NEXT:    srli a7, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
-; RV64I-NEXT:    sb a4, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 12(a2)
+; RV64I-NEXT:    sb s10, 1(a2)
+; RV64I-NEXT:    sb s9, 2(a2)
+; RV64I-NEXT:    sb s8, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 8
+; RV64I-NEXT:    sb a6, 12(a2)
 ; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    sb s11, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a7, 10(a2)
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 0(a0)
+; RV32I-NEXT:    lbu t0, 1(a0)
+; RV32I-NEXT:    lbu t1, 2(a0)
+; RV32I-NEXT:    lbu s1, 3(a0)
+; RV32I-NEXT:    lbu s7, 4(a0)
+; RV32I-NEXT:    lbu s8, 5(a0)
+; RV32I-NEXT:    lbu s4, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s5, 8(a0)
+; RV32I-NEXT:    lbu s10, 9(a0)
+; RV32I-NEXT:    lbu s11, 10(a0)
+; RV32I-NEXT:    lbu ra, 11(a0)
+; RV32I-NEXT:    lbu t4, 12(a0)
+; RV32I-NEXT:    lbu t6, 13(a0)
+; RV32I-NEXT:    lbu a5, 14(a0)
+; RV32I-NEXT:    lbu a6, 15(a0)
+; RV32I-NEXT:    lbu a3, 16(a0)
+; RV32I-NEXT:    lbu t2, 17(a0)
+; RV32I-NEXT:    lbu t3, 18(a0)
+; RV32I-NEXT:    lbu t5, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu s0, 21(a0)
+; RV32I-NEXT:    lbu s2, 22(a0)
+; RV32I-NEXT:    lbu s3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    lbu t0, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t3, t0
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t1, t4, t1
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s1, t1
+; RV32I-NEXT:    or t1, s8, s7
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s7, 25(a0)
+; RV32I-NEXT:    lbu s8, 26(a0)
+; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    slli s11, s11, 16
+; RV32I-NEXT:    slli ra, ra, 24
+; RV32I-NEXT:    or s4, s6, s4
+; RV32I-NEXT:    or s5, s10, s5
+; RV32I-NEXT:    or s6, ra, s11
+; RV32I-NEXT:    lbu s10, 28(a0)
+; RV32I-NEXT:    lbu s11, 29(a0)
+; RV32I-NEXT:    lbu ra, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    or a0, a0, t3
 ; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 68(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t4, t6, t4
+; RV32I-NEXT:    addi t6, sp, 8
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a3, t2, a3
+; RV32I-NEXT:    or a6, t5, t3
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t3, s7, s1
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or a0, a0, ra
 ; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    mv a0, sp
-; RV32I-NEXT:    add a4, a0, a1
-; RV32I-NEXT:    lw a5, 16(a4)
-; RV32I-NEXT:    lw a6, 20(a4)
-; RV32I-NEXT:    lw a7, 24(a4)
-; RV32I-NEXT:    lw a1, 0(a4)
-; RV32I-NEXT:    lw a0, 4(a4)
-; RV32I-NEXT:    lw t0, 8(a4)
-; RV32I-NEXT:    lw a3, 12(a4)
-; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s4, t1
+; RV32I-NEXT:    or t1, s6, s5
+; RV32I-NEXT:    or a5, a5, t4
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, t2, a4
+; RV32I-NEXT:    or a6, t5, t3
+; RV32I-NEXT:    or a0, a0, s0
+; RV32I-NEXT:    add t6, t6, a1
+; RV32I-NEXT:    sw a3, 24(sp)
+; RV32I-NEXT:    sw a4, 28(sp)
+; RV32I-NEXT:    sw a6, 32(sp)
+; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a7, 8(sp)
+; RV32I-NEXT:    sw t0, 12(sp)
+; RV32I-NEXT:    sw t1, 16(sp)
+; RV32I-NEXT:    sw a5, 20(sp)
+; RV32I-NEXT:    lw a6, 16(t6)
+; RV32I-NEXT:    lw a5, 20(t6)
+; RV32I-NEXT:    lw a7, 24(t6)
+; RV32I-NEXT:    lw a1, 0(t6)
+; RV32I-NEXT:    lw a0, 4(t6)
+; RV32I-NEXT:    lw a4, 8(t6)
+; RV32I-NEXT:    lw a3, 12(t6)
+; RV32I-NEXT:    lw t0, 28(t6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
+; RV32I-NEXT:    srli t4, t0, 24
+; RV32I-NEXT:    srli t5, t0, 16
+; RV32I-NEXT:    srli t6, t0, 8
+; RV32I-NEXT:    srli s0, a6, 24
+; RV32I-NEXT:    srli s1, a6, 16
+; RV32I-NEXT:    srli s2, a6, 8
+; RV32I-NEXT:    srli s3, a5, 24
+; RV32I-NEXT:    srli s4, a5, 16
+; RV32I-NEXT:    srli s5, a5, 8
+; RV32I-NEXT:    srli s6, a4, 24
+; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
+; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
 ; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t2, 26(a2)
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a4, 24
-; RV32I-NEXT:    srli t1, a4, 16
-; RV32I-NEXT:    srli t2, a4, 8
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb t2, 29(a2)
-; RV32I-NEXT:    sb t1, 30(a2)
-; RV32I-NEXT:    sb a7, 31(a2)
-; RV32I-NEXT:    srli a4, a5, 24
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    srli t1, a5, 8
-; RV32I-NEXT:    sb a5, 16(a2)
-; RV32I-NEXT:    sb t1, 17(a2)
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    sb a4, 19(a2)
-; RV32I-NEXT:    srli a4, a6, 24
-; RV32I-NEXT:    srli a5, a6, 16
-; RV32I-NEXT:    srli a7, a6, 8
-; RV32I-NEXT:    sb a6, 20(a2)
-; RV32I-NEXT:    sb a7, 21(a2)
-; RV32I-NEXT:    sb a5, 22(a2)
-; RV32I-NEXT:    sb a4, 23(a2)
-; RV32I-NEXT:    srli a4, t0, 24
-; RV32I-NEXT:    srli a5, t0, 16
-; RV32I-NEXT:    srli a6, t0, 8
-; RV32I-NEXT:    sb t0, 8(a2)
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    sb a5, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    sb t4, 31(a2)
+; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb s2, 17(a2)
+; RV32I-NEXT:    sb s1, 18(a2)
+; RV32I-NEXT:    sb s0, 19(a2)
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb s5, 21(a2)
+; RV32I-NEXT:    sb s4, 22(a2)
+; RV32I-NEXT:    sb s3, 23(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb s7, 10(a2)
+; RV32I-NEXT:    sb s6, 11(a2)
+; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    sb ra, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -3143,428 +3287,472 @@ define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 5(a1)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t3, s5, s4
+; RV64I-NEXT:    or t4, s7, s6
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    lbu s6, 5(a1)
+; RV64I-NEXT:    or s4, s7, s4
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli s6, s6, 8
+; RV64I-NEXT:    or s3, s6, s3
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    addi s6, sp, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t0, s0, t6
+; RV64I-NEXT:    or t1, s5, s1
+; RV64I-NEXT:    or t2, s4, s2
+; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    sd a4, 40(sp)
-; RV64I-NEXT:    sd a5, 48(sp)
-; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    andi a0, a1, 24
-; RV64I-NEXT:    addi a3, sp, 32
-; RV64I-NEXT:    sub a3, a3, a0
-; RV64I-NEXT:    ld a5, 0(a3)
-; RV64I-NEXT:    ld a6, 8(a3)
-; RV64I-NEXT:    slli a7, a1, 3
-; RV64I-NEXT:    ld t0, 16(a3)
-; RV64I-NEXT:    ld a1, 24(a3)
-; RV64I-NEXT:    sll a4, a6, a7
-; RV64I-NEXT:    andi a0, a7, 56
-; RV64I-NEXT:    xori a3, a0, 63
-; RV64I-NEXT:    srli a0, a5, 1
-; RV64I-NEXT:    srl a0, a0, a3
-; RV64I-NEXT:    or a0, a4, a0
-; RV64I-NEXT:    sll t1, a1, a7
-; RV64I-NEXT:    srli a1, t0, 1
-; RV64I-NEXT:    srl a1, a1, a3
-; RV64I-NEXT:    or a1, t1, a1
-; RV64I-NEXT:    sll t0, t0, a7
-; RV64I-NEXT:    srli a6, a6, 1
-; RV64I-NEXT:    srl a3, a6, a3
-; RV64I-NEXT:    or a3, t0, a3
-; RV64I-NEXT:    sll a5, a5, a7
-; RV64I-NEXT:    srli a6, t0, 56
-; RV64I-NEXT:    srli a7, a3, 48
-; RV64I-NEXT:    srli t0, a3, 40
-; RV64I-NEXT:    srli t2, a3, 32
-; RV64I-NEXT:    sb t2, 20(a2)
-; RV64I-NEXT:    sb t0, 21(a2)
-; RV64I-NEXT:    sb a7, 22(a2)
-; RV64I-NEXT:    sb a6, 23(a2)
-; RV64I-NEXT:    srli a6, t1, 56
-; RV64I-NEXT:    srli a7, a1, 48
-; RV64I-NEXT:    srli t0, a1, 40
-; RV64I-NEXT:    srli t1, a1, 32
-; RV64I-NEXT:    sb t1, 28(a2)
-; RV64I-NEXT:    sb t0, 29(a2)
-; RV64I-NEXT:    sb a7, 30(a2)
-; RV64I-NEXT:    sb a6, 31(a2)
-; RV64I-NEXT:    srli a6, a5, 56
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a5, 56(sp)
+; RV64I-NEXT:    slli a0, a1, 3
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    sub a1, s6, a1
+; RV64I-NEXT:    andi a3, a0, 56
+; RV64I-NEXT:    ld a4, 0(a1)
+; RV64I-NEXT:    ld a5, 8(a1)
+; RV64I-NEXT:    ld a6, 16(a1)
+; RV64I-NEXT:    ld a1, 24(a1)
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    sll a7, a5, a0
+; RV64I-NEXT:    srli t0, a4, 1
+; RV64I-NEXT:    sll t1, a1, a0
+; RV64I-NEXT:    srli a1, a6, 1
+; RV64I-NEXT:    sll t2, a6, a0
+; RV64I-NEXT:    srli a5, a5, 1
+; RV64I-NEXT:    sll t3, a4, a0
+; RV64I-NEXT:    srl a0, t0, a3
+; RV64I-NEXT:    srl a4, a1, a3
+; RV64I-NEXT:    srl a5, a5, a3
+; RV64I-NEXT:    srli a3, t2, 56
+; RV64I-NEXT:    srli a1, t1, 56
+; RV64I-NEXT:    srli t0, t3, 56
+; RV64I-NEXT:    srli t4, t3, 48
+; RV64I-NEXT:    srli t5, t3, 40
+; RV64I-NEXT:    srli t6, t3, 32
+; RV64I-NEXT:    srli s0, t3, 24
+; RV64I-NEXT:    srli s1, t3, 16
+; RV64I-NEXT:    srli s2, t3, 8
+; RV64I-NEXT:    srli a6, a7, 56
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    or a4, t1, a4
+; RV64I-NEXT:    or a5, t2, a5
+; RV64I-NEXT:    sb t6, 4(a2)
+; RV64I-NEXT:    sb t5, 5(a2)
+; RV64I-NEXT:    sb t4, 6(a2)
+; RV64I-NEXT:    sb t0, 7(a2)
+; RV64I-NEXT:    sb t3, 0(a2)
+; RV64I-NEXT:    sb s2, 1(a2)
+; RV64I-NEXT:    sb s1, 2(a2)
+; RV64I-NEXT:    sb s0, 3(a2)
 ; RV64I-NEXT:    srli a7, a5, 48
 ; RV64I-NEXT:    srli t0, a5, 40
 ; RV64I-NEXT:    srli t1, a5, 32
-; RV64I-NEXT:    sb t1, 4(a2)
-; RV64I-NEXT:    sb t0, 5(a2)
-; RV64I-NEXT:    sb a7, 6(a2)
-; RV64I-NEXT:    sb a6, 7(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    srli a7, a5, 16
-; RV64I-NEXT:    srli t0, a5, 8
-; RV64I-NEXT:    sb a5, 0(a2)
-; RV64I-NEXT:    sb t0, 1(a2)
-; RV64I-NEXT:    sb a7, 2(a2)
-; RV64I-NEXT:    sb a6, 3(a2)
-; RV64I-NEXT:    srli a4, a4, 56
-; RV64I-NEXT:    srli a5, a0, 48
-; RV64I-NEXT:    srli a6, a0, 40
-; RV64I-NEXT:    srli a7, a0, 32
-; RV64I-NEXT:    sb a7, 12(a2)
-; RV64I-NEXT:    sb a6, 13(a2)
-; RV64I-NEXT:    sb a5, 14(a2)
-; RV64I-NEXT:    sb a4, 15(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    srli a6, a3, 8
-; RV64I-NEXT:    sb a3, 16(a2)
-; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
-; RV64I-NEXT:    sb a1, 24(a2)
-; RV64I-NEXT:    sb a5, 25(a2)
-; RV64I-NEXT:    sb a4, 26(a2)
-; RV64I-NEXT:    sb a3, 27(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    srli t2, a5, 24
+; RV64I-NEXT:    srli t3, a5, 16
+; RV64I-NEXT:    srli t4, a5, 8
+; RV64I-NEXT:    srli t5, a4, 48
+; RV64I-NEXT:    srli t6, a4, 40
+; RV64I-NEXT:    srli s0, a4, 32
+; RV64I-NEXT:    srli s1, a4, 24
+; RV64I-NEXT:    srli s2, a4, 16
+; RV64I-NEXT:    srli s3, a4, 8
+; RV64I-NEXT:    srli s4, a0, 48
+; RV64I-NEXT:    srli s5, a0, 40
+; RV64I-NEXT:    srli s6, a0, 32
+; RV64I-NEXT:    sb t1, 20(a2)
+; RV64I-NEXT:    sb t0, 21(a2)
+; RV64I-NEXT:    sb a7, 22(a2)
+; RV64I-NEXT:    sb a3, 23(a2)
+; RV64I-NEXT:    srli a3, a0, 24
+; RV64I-NEXT:    sb s0, 28(a2)
+; RV64I-NEXT:    sb t6, 29(a2)
+; RV64I-NEXT:    sb t5, 30(a2)
+; RV64I-NEXT:    sb a1, 31(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb s6, 12(a2)
+; RV64I-NEXT:    sb s5, 13(a2)
+; RV64I-NEXT:    sb s4, 14(a2)
+; RV64I-NEXT:    sb a6, 15(a2)
+; RV64I-NEXT:    srli a6, a0, 8
+; RV64I-NEXT:    sb a5, 16(a2)
+; RV64I-NEXT:    sb t4, 17(a2)
+; RV64I-NEXT:    sb t3, 18(a2)
+; RV64I-NEXT:    sb t2, 19(a2)
+; RV64I-NEXT:    sb a4, 24(a2)
+; RV64I-NEXT:    sb s3, 25(a2)
+; RV64I-NEXT:    sb s2, 26(a2)
+; RV64I-NEXT:    sb s1, 27(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb a6, 9(a2)
+; RV64I-NEXT:    sb a1, 10(a2)
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -80
-; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu s1, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t3, 5(a0)
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu s0, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu s3, 9(a0)
+; RV32I-NEXT:    lbu s6, 10(a0)
+; RV32I-NEXT:    lbu s8, 11(a0)
+; RV32I-NEXT:    lbu s9, 12(a0)
+; RV32I-NEXT:    lbu s10, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s7, 15(a0)
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu ra, 18(a0)
+; RV32I-NEXT:    lbu a3, 19(a0)
+; RV32I-NEXT:    lbu t5, 20(a0)
+; RV32I-NEXT:    lbu t6, 21(a0)
+; RV32I-NEXT:    lbu a7, 22(a0)
+; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    lbu t0, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t3, t0
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
 ; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t1, t4, t1
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    lbu t2, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    lbu t2, 0(a1)
-; RV32I-NEXT:    lbu t4, 1(a1)
-; RV32I-NEXT:    or a0, a0, t3
-; RV32I-NEXT:    lbu t3, 2(a1)
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    or a4, a4, s1
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t3, t1
+; RV32I-NEXT:    or a6, s0, t4
+; RV32I-NEXT:    lbu t1, 24(a0)
+; RV32I-NEXT:    lbu s0, 25(a0)
+; RV32I-NEXT:    lbu s1, 26(a0)
+; RV32I-NEXT:    lbu s2, 27(a0)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or t2, s3, t2
+; RV32I-NEXT:    or t3, s8, s6
+; RV32I-NEXT:    or t4, s10, s9
+; RV32I-NEXT:    lbu s3, 28(a0)
+; RV32I-NEXT:    lbu s6, 29(a0)
+; RV32I-NEXT:    lbu s8, 30(a0)
+; RV32I-NEXT:    lbu s9, 31(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s7, s4
+; RV32I-NEXT:    or s4, s11, s5
+; RV32I-NEXT:    or s5, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s10, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t2, t4, t2
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t3
-; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
-; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw a7, 48(sp)
-; RV32I-NEXT:    sw t0, 52(sp)
-; RV32I-NEXT:    sw t1, 56(sp)
-; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw a4, 36(sp)
-; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a6, 44(sp)
-; RV32I-NEXT:    andi a0, a1, 28
-; RV32I-NEXT:    addi a3, sp, 32
-; RV32I-NEXT:    sub a7, a3, a0
-; RV32I-NEXT:    lw t3, 0(a7)
-; RV32I-NEXT:    lw a6, 4(a7)
-; RV32I-NEXT:    slli t4, a1, 3
-; RV32I-NEXT:    lw a5, 8(a7)
-; RV32I-NEXT:    lw t2, 12(a7)
-; RV32I-NEXT:    sll a0, a6, t4
-; RV32I-NEXT:    andi a1, t4, 24
-; RV32I-NEXT:    xori t5, a1, 31
-; RV32I-NEXT:    srli a1, t3, 1
-; RV32I-NEXT:    srl a1, a1, t5
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    sll a4, t2, t4
-; RV32I-NEXT:    srli a3, a5, 1
-; RV32I-NEXT:    srl a3, a3, t5
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sll a5, a5, t4
-; RV32I-NEXT:    srli a6, a6, 1
-; RV32I-NEXT:    srl a6, a6, t5
-; RV32I-NEXT:    lw t6, 16(a7)
-; RV32I-NEXT:    lw s0, 20(a7)
-; RV32I-NEXT:    or a6, a5, a6
-; RV32I-NEXT:    lw s1, 24(a7)
-; RV32I-NEXT:    lw a7, 28(a7)
-; RV32I-NEXT:    sll t1, s0, t4
-; RV32I-NEXT:    srli t0, t6, 1
-; RV32I-NEXT:    srl t0, t0, t5
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t5, t6, t5
+; RV32I-NEXT:    addi t6, sp, 40
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli s2, s2, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s0, t1
+; RV32I-NEXT:    or t1, s2, s1
+; RV32I-NEXT:    or s0, s6, s3
+; RV32I-NEXT:    or s1, s9, s8
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s10
+; RV32I-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s2
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t3, t2
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or a7, a7, t5
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    sll t6, t6, t4
-; RV32I-NEXT:    srli t2, t2, 1
-; RV32I-NEXT:    srl t2, t2, t5
-; RV32I-NEXT:    or t2, t6, t2
-; RV32I-NEXT:    sll s2, a7, t4
-; RV32I-NEXT:    srli a7, s1, 1
-; RV32I-NEXT:    srl a7, a7, t5
-; RV32I-NEXT:    or s3, s2, a7
-; RV32I-NEXT:    sll s1, s1, t4
-; RV32I-NEXT:    srli s0, s0, 1
-; RV32I-NEXT:    srl a7, s0, t5
-; RV32I-NEXT:    or t5, s1, a7
-; RV32I-NEXT:    sll a7, t3, t4
-; RV32I-NEXT:    srli s1, s1, 24
-; RV32I-NEXT:    srli t3, t5, 16
-; RV32I-NEXT:    srli t4, t5, 8
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    sb t4, 25(a2)
-; RV32I-NEXT:    sb t3, 26(a2)
-; RV32I-NEXT:    sb s1, 27(a2)
-; RV32I-NEXT:    srli t3, s2, 24
-; RV32I-NEXT:    srli t4, s3, 16
-; RV32I-NEXT:    srli t5, s3, 8
-; RV32I-NEXT:    sb s3, 28(a2)
-; RV32I-NEXT:    sb t5, 29(a2)
-; RV32I-NEXT:    sb t4, 30(a2)
-; RV32I-NEXT:    sb t3, 31(a2)
-; RV32I-NEXT:    srli t3, t6, 24
-; RV32I-NEXT:    srli t4, t2, 16
-; RV32I-NEXT:    srli t5, t2, 8
-; RV32I-NEXT:    sb t2, 16(a2)
-; RV32I-NEXT:    sb t5, 17(a2)
-; RV32I-NEXT:    sb t4, 18(a2)
-; RV32I-NEXT:    sb t3, 19(a2)
-; RV32I-NEXT:    srli t1, t1, 24
-; RV32I-NEXT:    srli t2, t0, 16
-; RV32I-NEXT:    srli t3, t0, 8
-; RV32I-NEXT:    sb t0, 20(a2)
-; RV32I-NEXT:    sb t3, 21(a2)
-; RV32I-NEXT:    sb t2, 22(a2)
-; RV32I-NEXT:    sb t1, 23(a2)
-; RV32I-NEXT:    srli a5, a5, 24
-; RV32I-NEXT:    srli t0, a6, 16
-; RV32I-NEXT:    srli t1, a6, 8
-; RV32I-NEXT:    sb a6, 8(a2)
-; RV32I-NEXT:    sb t1, 9(a2)
-; RV32I-NEXT:    sb t0, 10(a2)
-; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a4, a4, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a7, 24
-; RV32I-NEXT:    srli a4, a7, 16
-; RV32I-NEXT:    srli a5, a7, 8
-; RV32I-NEXT:    sb a7, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a0, a0, 24
-; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    srli a4, a1, 8
-; RV32I-NEXT:    sb a1, 4(a2)
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a0, 7(a2)
-; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 80
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw t2, 56(sp)
+; RV32I-NEXT:    sw a7, 60(sp)
+; RV32I-NEXT:    sw t0, 64(sp)
+; RV32I-NEXT:    sw s0, 68(sp)
+; RV32I-NEXT:    sw a4, 40(sp)
+; RV32I-NEXT:    sw a5, 44(sp)
+; RV32I-NEXT:    sw a6, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    slli a3, a1, 3
+; RV32I-NEXT:    andi a1, a1, 28
+; RV32I-NEXT:    sub a1, t6, a1
+; RV32I-NEXT:    andi a0, a3, 24
+; RV32I-NEXT:    xori a0, a0, 31
+; RV32I-NEXT:    lw a4, 0(a1)
+; RV32I-NEXT:    lw a5, 4(a1)
+; RV32I-NEXT:    lw a6, 8(a1)
+; RV32I-NEXT:    lw a7, 12(a1)
+; RV32I-NEXT:    lw t0, 16(a1)
+; RV32I-NEXT:    lw t1, 20(a1)
+; RV32I-NEXT:    lw t2, 24(a1)
+; RV32I-NEXT:    lw a1, 28(a1)
+; RV32I-NEXT:    sll t3, a5, a3
+; RV32I-NEXT:    srli t4, a4, 1
+; RV32I-NEXT:    sll t5, a7, a3
+; RV32I-NEXT:    srli t6, a6, 1
+; RV32I-NEXT:    sll s0, a6, a3
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    sll s1, t1, a3
+; RV32I-NEXT:    srli a6, t0, 1
+; RV32I-NEXT:    sll s2, t0, a3
+; RV32I-NEXT:    srli a7, a7, 1
+; RV32I-NEXT:    sll s3, a1, a3
+; RV32I-NEXT:    srli a1, t2, 1
+; RV32I-NEXT:    sll s4, t2, a3
+; RV32I-NEXT:    srli t0, t1, 1
+; RV32I-NEXT:    sll s5, a4, a3
+; RV32I-NEXT:    srl t2, t4, a0
+; RV32I-NEXT:    srl t4, t6, a0
+; RV32I-NEXT:    srl t6, a5, a0
+; RV32I-NEXT:    srl s6, a6, a0
+; RV32I-NEXT:    srl s7, a7, a0
+; RV32I-NEXT:    srl s8, a1, a0
+; RV32I-NEXT:    srl s9, t0, a0
+; RV32I-NEXT:    srli t1, s4, 24
+; RV32I-NEXT:    srli a7, s3, 24
+; RV32I-NEXT:    srli a5, s2, 24
+; RV32I-NEXT:    srli a3, s1, 24
+; RV32I-NEXT:    srli a1, s0, 24
+; RV32I-NEXT:    srli a0, t5, 24
+; RV32I-NEXT:    srli s10, s5, 24
+; RV32I-NEXT:    srli s11, s5, 16
+; RV32I-NEXT:    srli ra, s5, 8
+; RV32I-NEXT:    srli a4, t3, 24
+; RV32I-NEXT:    or a6, t3, t2
+; RV32I-NEXT:    or t0, t5, t4
+; RV32I-NEXT:    or t2, s0, t6
+; RV32I-NEXT:    or t3, s1, s6
+; RV32I-NEXT:    or t4, s2, s7
+; RV32I-NEXT:    or t5, s3, s8
+; RV32I-NEXT:    or t6, s4, s9
+; RV32I-NEXT:    sb s5, 0(a2)
+; RV32I-NEXT:    sb ra, 1(a2)
+; RV32I-NEXT:    sb s11, 2(a2)
+; RV32I-NEXT:    sb s10, 3(a2)
+; RV32I-NEXT:    srli s0, t6, 16
+; RV32I-NEXT:    srli s1, t6, 8
+; RV32I-NEXT:    srli s2, t5, 16
+; RV32I-NEXT:    srli s3, t5, 8
+; RV32I-NEXT:    srli s4, t4, 16
+; RV32I-NEXT:    srli s5, t4, 8
+; RV32I-NEXT:    srli s6, t3, 16
+; RV32I-NEXT:    srli s7, t3, 8
+; RV32I-NEXT:    srli s8, t2, 16
+; RV32I-NEXT:    srli s9, t2, 8
+; RV32I-NEXT:    srli s10, t0, 16
+; RV32I-NEXT:    srli s11, t0, 8
+; RV32I-NEXT:    sb t6, 24(a2)
+; RV32I-NEXT:    sb s1, 25(a2)
+; RV32I-NEXT:    sb s0, 26(a2)
+; RV32I-NEXT:    sb t1, 27(a2)
+; RV32I-NEXT:    srli t1, a6, 16
+; RV32I-NEXT:    sb t5, 28(a2)
+; RV32I-NEXT:    sb s3, 29(a2)
+; RV32I-NEXT:    sb s2, 30(a2)
+; RV32I-NEXT:    sb a7, 31(a2)
+; RV32I-NEXT:    srli a7, a6, 8
+; RV32I-NEXT:    sb t4, 16(a2)
+; RV32I-NEXT:    sb s5, 17(a2)
+; RV32I-NEXT:    sb s4, 18(a2)
+; RV32I-NEXT:    sb a5, 19(a2)
+; RV32I-NEXT:    sb t3, 20(a2)
+; RV32I-NEXT:    sb s7, 21(a2)
+; RV32I-NEXT:    sb s6, 22(a2)
+; RV32I-NEXT:    sb a3, 23(a2)
+; RV32I-NEXT:    sb t2, 8(a2)
+; RV32I-NEXT:    sb s9, 9(a2)
+; RV32I-NEXT:    sb s8, 10(a2)
+; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    sb t0, 12(a2)
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb a0, 15(a2)
+; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
+; RV32I-NEXT:    sb t1, 6(a2)
+; RV32I-NEXT:    sb a4, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -3577,381 +3765,431 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
+; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 5(a1)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t3, s5, s4
+; RV64I-NEXT:    or t4, s7, s6
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    lbu s6, 5(a1)
+; RV64I-NEXT:    or s4, s7, s4
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    slli s6, s6, 8
+; RV64I-NEXT:    or s3, s6, s3
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    addi s6, sp, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t0, s0, t6
+; RV64I-NEXT:    or t1, s5, s1
+; RV64I-NEXT:    or t2, s4, s2
+; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    sd a4, 40(sp)
-; RV64I-NEXT:    sd a5, 48(sp)
-; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    slli a0, a1, 2
-; RV64I-NEXT:    andi a0, a0, 24
-; RV64I-NEXT:    addi a3, sp, 32
-; RV64I-NEXT:    sub a3, a3, a0
-; RV64I-NEXT:    ld a5, 0(a3)
-; RV64I-NEXT:    ld a4, 8(a3)
-; RV64I-NEXT:    slli a6, a1, 5
-; RV64I-NEXT:    ld a7, 16(a3)
-; RV64I-NEXT:    ld a1, 24(a3)
-; RV64I-NEXT:    sll a3, a4, a6
-; RV64I-NEXT:    andi a0, a6, 32
-; RV64I-NEXT:    xori t0, a0, 63
-; RV64I-NEXT:    srli a0, a5, 1
-; RV64I-NEXT:    srl a0, a0, t0
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    sll t1, a1, a6
-; RV64I-NEXT:    srli a1, a7, 1
-; RV64I-NEXT:    srl a1, a1, t0
-; RV64I-NEXT:    or a1, t1, a1
-; RV64I-NEXT:    sll a7, a7, a6
-; RV64I-NEXT:    srli a4, a4, 1
-; RV64I-NEXT:    srl a4, a4, t0
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    sll a5, a5, a6
-; RV64I-NEXT:    srli a6, a7, 56
-; RV64I-NEXT:    srli t0, a7, 48
-; RV64I-NEXT:    srli t2, a7, 40
-; RV64I-NEXT:    srli a7, a7, 32
-; RV64I-NEXT:    sb a7, 20(a2)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a5, 56(sp)
+; RV64I-NEXT:    slli a3, a1, 5
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    andi a0, a3, 32
+; RV64I-NEXT:    sub a1, s6, a1
+; RV64I-NEXT:    ld a4, 0(a1)
+; RV64I-NEXT:    ld a5, 8(a1)
+; RV64I-NEXT:    ld a6, 16(a1)
+; RV64I-NEXT:    ld a1, 24(a1)
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    sll a0, a5, a3
+; RV64I-NEXT:    srli t0, a4, 1
+; RV64I-NEXT:    sll a1, a1, a3
+; RV64I-NEXT:    srli t1, a6, 1
+; RV64I-NEXT:    sll a6, a6, a3
+; RV64I-NEXT:    srli a5, a5, 1
+; RV64I-NEXT:    sll a3, a4, a3
+; RV64I-NEXT:    srl a4, t0, a7
+; RV64I-NEXT:    srl t0, t1, a7
+; RV64I-NEXT:    srl a5, a5, a7
+; RV64I-NEXT:    srli a7, a6, 56
+; RV64I-NEXT:    srli t1, a6, 48
+; RV64I-NEXT:    srli t2, a6, 40
+; RV64I-NEXT:    srli t3, a6, 32
+; RV64I-NEXT:    srli t4, a1, 56
+; RV64I-NEXT:    srli t5, a1, 48
+; RV64I-NEXT:    srli t6, a1, 40
+; RV64I-NEXT:    srli s0, a1, 32
+; RV64I-NEXT:    srli s1, a3, 56
+; RV64I-NEXT:    srli s2, a3, 48
+; RV64I-NEXT:    srli s3, a3, 40
+; RV64I-NEXT:    srli s4, a3, 32
+; RV64I-NEXT:    srli s5, a3, 24
+; RV64I-NEXT:    srli s6, a3, 16
+; RV64I-NEXT:    or a1, a1, t0
+; RV64I-NEXT:    srli t0, a3, 8
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    srli a6, a0, 56
+; RV64I-NEXT:    sb t3, 20(a2)
 ; RV64I-NEXT:    sb t2, 21(a2)
-; RV64I-NEXT:    sb t0, 22(a2)
-; RV64I-NEXT:    sb a6, 23(a2)
-; RV64I-NEXT:    srli a6, t1, 56
-; RV64I-NEXT:    srli a7, t1, 48
-; RV64I-NEXT:    srli t0, t1, 40
-; RV64I-NEXT:    srli t1, t1, 32
-; RV64I-NEXT:    sb t1, 28(a2)
-; RV64I-NEXT:    sb t0, 29(a2)
-; RV64I-NEXT:    sb a7, 30(a2)
-; RV64I-NEXT:    sb a6, 31(a2)
-; RV64I-NEXT:    srli a6, a5, 56
-; RV64I-NEXT:    srli a7, a5, 48
-; RV64I-NEXT:    srli t0, a5, 40
-; RV64I-NEXT:    srli t1, a5, 32
-; RV64I-NEXT:    sb t1, 4(a2)
-; RV64I-NEXT:    sb t0, 5(a2)
-; RV64I-NEXT:    sb a7, 6(a2)
-; RV64I-NEXT:    sb a6, 7(a2)
-; RV64I-NEXT:    srli a6, a5, 24
-; RV64I-NEXT:    srli a7, a5, 16
-; RV64I-NEXT:    srli t0, a5, 8
-; RV64I-NEXT:    sb a5, 0(a2)
+; RV64I-NEXT:    sb t1, 22(a2)
+; RV64I-NEXT:    sb a7, 23(a2)
+; RV64I-NEXT:    srli a7, a0, 48
+; RV64I-NEXT:    sb s0, 28(a2)
+; RV64I-NEXT:    sb t6, 29(a2)
+; RV64I-NEXT:    sb t5, 30(a2)
+; RV64I-NEXT:    sb t4, 31(a2)
+; RV64I-NEXT:    srli t1, a0, 40
+; RV64I-NEXT:    or a4, a0, a4
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    sb s4, 4(a2)
+; RV64I-NEXT:    sb s3, 5(a2)
+; RV64I-NEXT:    sb s2, 6(a2)
+; RV64I-NEXT:    sb s1, 7(a2)
+; RV64I-NEXT:    sb a3, 0(a2)
 ; RV64I-NEXT:    sb t0, 1(a2)
-; RV64I-NEXT:    sb a7, 2(a2)
-; RV64I-NEXT:    sb a6, 3(a2)
-; RV64I-NEXT:    srli a5, a3, 56
-; RV64I-NEXT:    srli a6, a3, 48
-; RV64I-NEXT:    srli a7, a3, 40
-; RV64I-NEXT:    srli a3, a3, 32
-; RV64I-NEXT:    sb a3, 12(a2)
-; RV64I-NEXT:    sb a7, 13(a2)
-; RV64I-NEXT:    sb a6, 14(a2)
-; RV64I-NEXT:    sb a5, 15(a2)
-; RV64I-NEXT:    srli a3, a4, 24
-; RV64I-NEXT:    srli a5, a4, 16
-; RV64I-NEXT:    srli a6, a4, 8
-; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    sb s6, 2(a2)
+; RV64I-NEXT:    sb s5, 3(a2)
+; RV64I-NEXT:    sb a0, 12(a2)
+; RV64I-NEXT:    sb t1, 13(a2)
+; RV64I-NEXT:    sb a7, 14(a2)
+; RV64I-NEXT:    sb a6, 15(a2)
+; RV64I-NEXT:    srli a0, a5, 24
+; RV64I-NEXT:    srli a3, a5, 16
+; RV64I-NEXT:    srli a6, a5, 8
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t0, a1, 16
+; RV64I-NEXT:    srli t1, a1, 8
+; RV64I-NEXT:    srli t2, a4, 24
+; RV64I-NEXT:    srli t3, a4, 16
+; RV64I-NEXT:    srli t4, a4, 8
+; RV64I-NEXT:    sb a5, 16(a2)
 ; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb a3, 19(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    sb a3, 18(a2)
+; RV64I-NEXT:    sb a0, 19(a2)
 ; RV64I-NEXT:    sb a1, 24(a2)
-; RV64I-NEXT:    sb a5, 25(a2)
-; RV64I-NEXT:    sb a4, 26(a2)
-; RV64I-NEXT:    sb a3, 27(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
-; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb t1, 25(a2)
+; RV64I-NEXT:    sb t0, 26(a2)
+; RV64I-NEXT:    sb a7, 27(a2)
+; RV64I-NEXT:    sb a4, 8(a2)
+; RV64I-NEXT:    sb t4, 9(a2)
+; RV64I-NEXT:    sb t3, 10(a2)
+; RV64I-NEXT:    sb t2, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 0(a0)
+; RV32I-NEXT:    lbu t0, 1(a0)
+; RV32I-NEXT:    lbu t1, 2(a0)
+; RV32I-NEXT:    lbu s1, 3(a0)
+; RV32I-NEXT:    lbu s7, 4(a0)
+; RV32I-NEXT:    lbu s8, 5(a0)
+; RV32I-NEXT:    lbu s4, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s5, 8(a0)
+; RV32I-NEXT:    lbu s10, 9(a0)
+; RV32I-NEXT:    lbu s11, 10(a0)
+; RV32I-NEXT:    lbu ra, 11(a0)
+; RV32I-NEXT:    lbu t4, 12(a0)
+; RV32I-NEXT:    lbu t6, 13(a0)
+; RV32I-NEXT:    lbu a5, 14(a0)
+; RV32I-NEXT:    lbu a6, 15(a0)
+; RV32I-NEXT:    lbu a3, 16(a0)
+; RV32I-NEXT:    lbu t2, 17(a0)
+; RV32I-NEXT:    lbu t3, 18(a0)
+; RV32I-NEXT:    lbu t5, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu s0, 21(a0)
+; RV32I-NEXT:    lbu s2, 22(a0)
+; RV32I-NEXT:    lbu s3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    lbu t0, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t3, t0
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t1, t4, t1
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s1, t1
+; RV32I-NEXT:    or t1, s8, s7
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s7, 25(a0)
+; RV32I-NEXT:    lbu s8, 26(a0)
+; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    slli s11, s11, 16
+; RV32I-NEXT:    slli ra, ra, 24
+; RV32I-NEXT:    or s4, s6, s4
+; RV32I-NEXT:    or s5, s10, s5
+; RV32I-NEXT:    or s6, ra, s11
+; RV32I-NEXT:    lbu s10, 28(a0)
+; RV32I-NEXT:    lbu s11, 29(a0)
+; RV32I-NEXT:    lbu ra, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    or a0, a0, t3
 ; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
-; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw a7, 48(sp)
-; RV32I-NEXT:    sw t0, 52(sp)
-; RV32I-NEXT:    sw t1, 56(sp)
-; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw a4, 36(sp)
-; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t4, t6, t4
+; RV32I-NEXT:    addi t6, sp, 40
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a3, t2, a3
+; RV32I-NEXT:    or a6, t5, t3
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t3, s7, s1
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or a0, a0, ra
 ; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    addi a0, sp, 32
-; RV32I-NEXT:    sub a4, a0, a1
-; RV32I-NEXT:    lw a5, 16(a4)
-; RV32I-NEXT:    lw a6, 20(a4)
-; RV32I-NEXT:    lw a7, 24(a4)
-; RV32I-NEXT:    lw a1, 0(a4)
-; RV32I-NEXT:    lw a0, 4(a4)
-; RV32I-NEXT:    lw t0, 8(a4)
-; RV32I-NEXT:    lw a3, 12(a4)
-; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s4, t1
+; RV32I-NEXT:    or t1, s6, s5
+; RV32I-NEXT:    or a5, a5, t4
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, t2, a4
+; RV32I-NEXT:    or a6, t5, t3
+; RV32I-NEXT:    or a0, a0, s0
+; RV32I-NEXT:    sub t2, t6, a1
+; RV32I-NEXT:    sw a3, 56(sp)
+; RV32I-NEXT:    sw a4, 60(sp)
+; RV32I-NEXT:    sw a6, 64(sp)
+; RV32I-NEXT:    sw a0, 68(sp)
+; RV32I-NEXT:    sw a7, 40(sp)
+; RV32I-NEXT:    sw t0, 44(sp)
+; RV32I-NEXT:    sw t1, 48(sp)
+; RV32I-NEXT:    sw a5, 52(sp)
+; RV32I-NEXT:    lw a6, 16(t2)
+; RV32I-NEXT:    lw a5, 20(t2)
+; RV32I-NEXT:    lw a7, 24(t2)
+; RV32I-NEXT:    lw a1, 0(t2)
+; RV32I-NEXT:    lw a0, 4(t2)
+; RV32I-NEXT:    lw a4, 8(t2)
+; RV32I-NEXT:    lw a3, 12(t2)
+; RV32I-NEXT:    lw t0, 28(t2)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
+; RV32I-NEXT:    srli t4, t0, 24
+; RV32I-NEXT:    srli t5, t0, 16
+; RV32I-NEXT:    srli t6, t0, 8
+; RV32I-NEXT:    srli s0, a6, 24
+; RV32I-NEXT:    srli s1, a6, 16
+; RV32I-NEXT:    srli s2, a6, 8
+; RV32I-NEXT:    srli s3, a5, 24
+; RV32I-NEXT:    srli s4, a5, 16
+; RV32I-NEXT:    srli s5, a5, 8
+; RV32I-NEXT:    srli s6, a4, 24
+; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
+; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
 ; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t2, 26(a2)
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a4, 24
-; RV32I-NEXT:    srli t1, a4, 16
-; RV32I-NEXT:    srli t2, a4, 8
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb t2, 29(a2)
-; RV32I-NEXT:    sb t1, 30(a2)
-; RV32I-NEXT:    sb a7, 31(a2)
-; RV32I-NEXT:    srli a4, a5, 24
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    srli t1, a5, 8
-; RV32I-NEXT:    sb a5, 16(a2)
-; RV32I-NEXT:    sb t1, 17(a2)
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    sb a4, 19(a2)
-; RV32I-NEXT:    srli a4, a6, 24
-; RV32I-NEXT:    srli a5, a6, 16
-; RV32I-NEXT:    srli a7, a6, 8
-; RV32I-NEXT:    sb a6, 20(a2)
-; RV32I-NEXT:    sb a7, 21(a2)
-; RV32I-NEXT:    sb a5, 22(a2)
-; RV32I-NEXT:    sb a4, 23(a2)
-; RV32I-NEXT:    srli a4, t0, 24
-; RV32I-NEXT:    srli a5, t0, 16
-; RV32I-NEXT:    srli a6, t0, 8
-; RV32I-NEXT:    sb t0, 8(a2)
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    sb a5, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    sb t4, 31(a2)
+; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb s2, 17(a2)
+; RV32I-NEXT:    sb s1, 18(a2)
+; RV32I-NEXT:    sb s0, 19(a2)
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb s5, 21(a2)
+; RV32I-NEXT:    sb s4, 22(a2)
+; RV32I-NEXT:    sb s3, 23(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb s7, 10(a2)
+; RV32I-NEXT:    sb s6, 11(a2)
+; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    sb ra, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -3964,344 +4202,394 @@ define void @shl_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounw
 define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes_dwordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
-; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t2, 2(a0)
+; RV64I-NEXT:    lbu s3, 3(a0)
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu s8, 5(a0)
+; RV64I-NEXT:    lbu s9, 6(a0)
+; RV64I-NEXT:    lbu s10, 7(a0)
+; RV64I-NEXT:    lbu s2, 8(a0)
+; RV64I-NEXT:    lbu s4, 9(a0)
+; RV64I-NEXT:    lbu s5, 10(a0)
+; RV64I-NEXT:    lbu s6, 11(a0)
+; RV64I-NEXT:    lbu s7, 12(a0)
+; RV64I-NEXT:    lbu s11, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t3, 15(a0)
+; RV64I-NEXT:    lbu a3, 16(a0)
+; RV64I-NEXT:    lbu a6, 17(a0)
+; RV64I-NEXT:    lbu t4, 18(a0)
+; RV64I-NEXT:    lbu t5, 19(a0)
+; RV64I-NEXT:    lbu a4, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s0, 22(a0)
+; RV64I-NEXT:    lbu s1, 23(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, s3, t2
+; RV64I-NEXT:    or t0, s8, t0
+; RV64I-NEXT:    or t2, s10, s9
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s8, 25(a0)
+; RV64I-NEXT:    lbu s9, 26(a0)
+; RV64I-NEXT:    lbu s10, 27(a0)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    or s2, s4, s2
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    or s5, s11, s7
+; RV64I-NEXT:    lbu s6, 28(a0)
+; RV64I-NEXT:    lbu s7, 29(a0)
+; RV64I-NEXT:    lbu s11, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    lbu a1, 0(a1)
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    sd a3, 32(sp)
-; RV64I-NEXT:    sd a4, 40(sp)
-; RV64I-NEXT:    sd a5, 48(sp)
-; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t3, t1
+; RV64I-NEXT:    addi t3, sp, 32
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli t5, t5, 24
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    slli s0, s0, 16
+; RV64I-NEXT:    slli s1, s1, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a6, t5, t4
+; RV64I-NEXT:    or a4, t6, a4
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or t4, s8, s3
+; RV64I-NEXT:    or t5, s10, s9
+; RV64I-NEXT:    or t6, s7, s6
+; RV64I-NEXT:    or a0, a0, s11
 ; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    addi a0, sp, 32
-; RV64I-NEXT:    sub a3, a0, a1
-; RV64I-NEXT:    ld a4, 16(a3)
-; RV64I-NEXT:    ld a0, 8(a3)
-; RV64I-NEXT:    ld a1, 0(a3)
-; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, t2, t0
+; RV64I-NEXT:    or t0, s4, s2
+; RV64I-NEXT:    or t1, t1, s5
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a4, s0, a4
+; RV64I-NEXT:    or a6, t5, t4
+; RV64I-NEXT:    or a0, a0, t6
+; RV64I-NEXT:    sub t2, t3, a1
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    or a1, a7, a5
+; RV64I-NEXT:    or a5, t1, t0
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a0, a0, a6
+; RV64I-NEXT:    sd a1, 32(sp)
+; RV64I-NEXT:    sd a5, 40(sp)
+; RV64I-NEXT:    sd a3, 48(sp)
+; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    ld a4, 16(t2)
+; RV64I-NEXT:    ld a0, 8(t2)
+; RV64I-NEXT:    ld a1, 0(t2)
+; RV64I-NEXT:    ld a3, 24(t2)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
 ; RV64I-NEXT:    srli t0, a4, 32
+; RV64I-NEXT:    srli t1, a4, 24
+; RV64I-NEXT:    srli t2, a4, 16
+; RV64I-NEXT:    srli t3, a4, 8
+; RV64I-NEXT:    srli t4, a3, 56
+; RV64I-NEXT:    srli t5, a3, 48
+; RV64I-NEXT:    srli t6, a3, 40
+; RV64I-NEXT:    srli s0, a3, 32
+; RV64I-NEXT:    srli s1, a3, 24
+; RV64I-NEXT:    srli s2, a3, 16
+; RV64I-NEXT:    srli s3, a3, 8
+; RV64I-NEXT:    srli s4, a1, 56
+; RV64I-NEXT:    srli s5, a1, 48
+; RV64I-NEXT:    srli s6, a1, 40
+; RV64I-NEXT:    srli s7, a1, 32
+; RV64I-NEXT:    srli s8, a1, 24
+; RV64I-NEXT:    srli s9, a1, 16
+; RV64I-NEXT:    srli s10, a1, 8
+; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
 ; RV64I-NEXT:    sb a7, 21(a2)
 ; RV64I-NEXT:    sb a6, 22(a2)
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a4, 24
-; RV64I-NEXT:    srli a6, a4, 16
-; RV64I-NEXT:    srli a7, a4, 8
+; RV64I-NEXT:    srli a5, a0, 48
 ; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    sb a7, 17(a2)
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    sb a5, 19(a2)
-; RV64I-NEXT:    srli a4, a3, 56
-; RV64I-NEXT:    srli a5, a3, 48
-; RV64I-NEXT:    srli a6, a3, 40
-; RV64I-NEXT:    srli a7, a3, 32
-; RV64I-NEXT:    sb a7, 28(a2)
-; RV64I-NEXT:    sb a6, 29(a2)
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    sb a4, 31(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    srli a6, a3, 8
+; RV64I-NEXT:    sb t3, 17(a2)
+; RV64I-NEXT:    sb t2, 18(a2)
+; RV64I-NEXT:    sb t1, 19(a2)
+; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    sb s0, 28(a2)
+; RV64I-NEXT:    sb t6, 29(a2)
+; RV64I-NEXT:    sb t5, 30(a2)
+; RV64I-NEXT:    sb t4, 31(a2)
+; RV64I-NEXT:    srli a6, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
-; RV64I-NEXT:    sb a6, 25(a2)
-; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    sb a4, 27(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a6, a1, 32
-; RV64I-NEXT:    sb a6, 4(a2)
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    sb a4, 6(a2)
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    sb s3, 25(a2)
+; RV64I-NEXT:    sb s2, 26(a2)
+; RV64I-NEXT:    sb s1, 27(a2)
+; RV64I-NEXT:    srli a3, a0, 24
+; RV64I-NEXT:    sb s7, 4(a2)
+; RV64I-NEXT:    sb s6, 5(a2)
+; RV64I-NEXT:    sb s5, 6(a2)
+; RV64I-NEXT:    sb s4, 7(a2)
+; RV64I-NEXT:    srli a7, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
-; RV64I-NEXT:    sb a4, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 12(a2)
+; RV64I-NEXT:    sb s10, 1(a2)
+; RV64I-NEXT:    sb s9, 2(a2)
+; RV64I-NEXT:    sb s8, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 8
+; RV64I-NEXT:    sb a6, 12(a2)
 ; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    sb s11, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a7, 10(a2)
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a7, 0(a0)
+; RV32I-NEXT:    lbu t0, 1(a0)
+; RV32I-NEXT:    lbu t1, 2(a0)
+; RV32I-NEXT:    lbu s1, 3(a0)
+; RV32I-NEXT:    lbu s7, 4(a0)
+; RV32I-NEXT:    lbu s8, 5(a0)
+; RV32I-NEXT:    lbu s4, 6(a0)
+; RV32I-NEXT:    lbu s6, 7(a0)
+; RV32I-NEXT:    lbu s5, 8(a0)
+; RV32I-NEXT:    lbu s10, 9(a0)
+; RV32I-NEXT:    lbu s11, 10(a0)
+; RV32I-NEXT:    lbu ra, 11(a0)
+; RV32I-NEXT:    lbu t4, 12(a0)
+; RV32I-NEXT:    lbu t6, 13(a0)
+; RV32I-NEXT:    lbu a5, 14(a0)
+; RV32I-NEXT:    lbu a6, 15(a0)
+; RV32I-NEXT:    lbu a3, 16(a0)
+; RV32I-NEXT:    lbu t2, 17(a0)
+; RV32I-NEXT:    lbu t3, 18(a0)
+; RV32I-NEXT:    lbu t5, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu s0, 21(a0)
+; RV32I-NEXT:    lbu s2, 22(a0)
+; RV32I-NEXT:    lbu s3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    lbu t0, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t3, t0
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
 ; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t1, t4, t1
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    lbu t2, 30(a0)
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s1, t1
+; RV32I-NEXT:    or t1, s8, s7
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s7, 25(a0)
+; RV32I-NEXT:    lbu s8, 26(a0)
+; RV32I-NEXT:    lbu s9, 27(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    slli s11, s11, 16
+; RV32I-NEXT:    slli ra, ra, 24
+; RV32I-NEXT:    or s4, s6, s4
+; RV32I-NEXT:    or s5, s10, s5
+; RV32I-NEXT:    or s6, ra, s11
+; RV32I-NEXT:    lbu s10, 28(a0)
+; RV32I-NEXT:    lbu s11, 29(a0)
+; RV32I-NEXT:    lbu ra, 30(a0)
 ; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, t2
-; RV32I-NEXT:    or a0, a0, t3
 ; RV32I-NEXT:    lbu a1, 0(a1)
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
-; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw a7, 48(sp)
-; RV32I-NEXT:    sw t0, 52(sp)
-; RV32I-NEXT:    sw t1, 56(sp)
-; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw a4, 36(sp)
-; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a6, 44(sp)
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli t6, t6, 8
+; RV32I-NEXT:    or t4, t6, t4
+; RV32I-NEXT:    addi t6, sp, 40
+; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    slli t2, t2, 8
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a3, t2, a3
+; RV32I-NEXT:    or a6, t5, t3
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t3, s7, s1
+; RV32I-NEXT:    or t5, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or a0, a0, ra
 ; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    addi a0, sp, 32
-; RV32I-NEXT:    sub a4, a0, a1
-; RV32I-NEXT:    lw a5, 16(a4)
-; RV32I-NEXT:    lw a6, 20(a4)
-; RV32I-NEXT:    lw a7, 24(a4)
-; RV32I-NEXT:    lw a1, 0(a4)
-; RV32I-NEXT:    lw a0, 4(a4)
-; RV32I-NEXT:    lw t0, 8(a4)
-; RV32I-NEXT:    lw a3, 12(a4)
-; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, s4, t1
+; RV32I-NEXT:    or t1, s6, s5
+; RV32I-NEXT:    or a5, a5, t4
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    or a4, t2, a4
+; RV32I-NEXT:    or a6, t5, t3
+; RV32I-NEXT:    or a0, a0, s0
+; RV32I-NEXT:    sub t2, t6, a1
+; RV32I-NEXT:    sw a3, 56(sp)
+; RV32I-NEXT:    sw a4, 60(sp)
+; RV32I-NEXT:    sw a6, 64(sp)
+; RV32I-NEXT:    sw a0, 68(sp)
+; RV32I-NEXT:    sw a7, 40(sp)
+; RV32I-NEXT:    sw t0, 44(sp)
+; RV32I-NEXT:    sw t1, 48(sp)
+; RV32I-NEXT:    sw a5, 52(sp)
+; RV32I-NEXT:    lw a6, 16(t2)
+; RV32I-NEXT:    lw a5, 20(t2)
+; RV32I-NEXT:    lw a7, 24(t2)
+; RV32I-NEXT:    lw a1, 0(t2)
+; RV32I-NEXT:    lw a0, 4(t2)
+; RV32I-NEXT:    lw a4, 8(t2)
+; RV32I-NEXT:    lw a3, 12(t2)
+; RV32I-NEXT:    lw t0, 28(t2)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
+; RV32I-NEXT:    srli t4, t0, 24
+; RV32I-NEXT:    srli t5, t0, 16
+; RV32I-NEXT:    srli t6, t0, 8
+; RV32I-NEXT:    srli s0, a6, 24
+; RV32I-NEXT:    srli s1, a6, 16
+; RV32I-NEXT:    srli s2, a6, 8
+; RV32I-NEXT:    srli s3, a5, 24
+; RV32I-NEXT:    srli s4, a5, 16
+; RV32I-NEXT:    srli s5, a5, 8
+; RV32I-NEXT:    srli s6, a4, 24
+; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
+; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
 ; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t2, 26(a2)
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a4, 24
-; RV32I-NEXT:    srli t1, a4, 16
-; RV32I-NEXT:    srli t2, a4, 8
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb t2, 29(a2)
-; RV32I-NEXT:    sb t1, 30(a2)
-; RV32I-NEXT:    sb a7, 31(a2)
-; RV32I-NEXT:    srli a4, a5, 24
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    srli t1, a5, 8
-; RV32I-NEXT:    sb a5, 16(a2)
-; RV32I-NEXT:    sb t1, 17(a2)
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    sb a4, 19(a2)
-; RV32I-NEXT:    srli a4, a6, 24
-; RV32I-NEXT:    srli a5, a6, 16
-; RV32I-NEXT:    srli a7, a6, 8
-; RV32I-NEXT:    sb a6, 20(a2)
-; RV32I-NEXT:    sb a7, 21(a2)
-; RV32I-NEXT:    sb a5, 22(a2)
-; RV32I-NEXT:    sb a4, 23(a2)
-; RV32I-NEXT:    srli a4, t0, 24
-; RV32I-NEXT:    srli a5, t0, 16
-; RV32I-NEXT:    srli a6, t0, 8
-; RV32I-NEXT:    sb t0, 8(a2)
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    sb a5, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    sb t4, 31(a2)
+; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb s2, 17(a2)
+; RV32I-NEXT:    sb s1, 18(a2)
+; RV32I-NEXT:    sb s0, 19(a2)
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb s5, 21(a2)
+; RV32I-NEXT:    sb s4, 22(a2)
+; RV32I-NEXT:    sb s3, 23(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb s7, 10(a2)
+; RV32I-NEXT:    sb s6, 11(a2)
+; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    sb ra, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1
@@ -4314,430 +4602,474 @@ define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nou
 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a7, a0, 32
-; RV64I-NEXT:    lbu t0, 0(a1)
-; RV64I-NEXT:    lbu t1, 1(a1)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu t2, 3(a1)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a7, t2, a7
-; RV64I-NEXT:    lbu t1, 4(a1)
-; RV64I-NEXT:    lbu t2, 5(a1)
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t3, s5, s4
+; RV64I-NEXT:    or t4, s7, s6
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    lbu s6, 5(a1)
+; RV64I-NEXT:    or s4, s7, s4
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or t1, t2, t1
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli s6, s6, 8
+; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    mv s6, sp
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t0, s0, t6
+; RV64I-NEXT:    or t1, s5, s1
+; RV64I-NEXT:    or t2, s4, s2
+; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t3, t1, 32
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    sraiw a0, a0, 31
-; RV64I-NEXT:    sd a0, 32(sp)
-; RV64I-NEXT:    sd a0, 40(sp)
-; RV64I-NEXT:    sd a0, 48(sp)
-; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sraiw t1, t1, 31
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, t3, t0
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    sd t1, 32(sp)
+; RV64I-NEXT:    sd t1, 40(sp)
+; RV64I-NEXT:    sd t1, 48(sp)
+; RV64I-NEXT:    sd t1, 56(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a6, 24(sp)
-; RV64I-NEXT:    andi a0, a1, 24
-; RV64I-NEXT:    mv a3, sp
-; RV64I-NEXT:    add a0, a3, a0
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    ld a4, 8(a0)
-; RV64I-NEXT:    slli a5, a1, 3
-; RV64I-NEXT:    ld a6, 16(a0)
-; RV64I-NEXT:    ld a7, 24(a0)
-; RV64I-NEXT:    srl a0, a4, a5
-; RV64I-NEXT:    andi a1, a5, 56
-; RV64I-NEXT:    xori t0, a1, 63
-; RV64I-NEXT:    slli a1, a6, 1
-; RV64I-NEXT:    sll a1, a1, t0
-; RV64I-NEXT:    or a1, a0, a1
-; RV64I-NEXT:    srl a3, a3, a5
-; RV64I-NEXT:    slli a4, a4, 1
-; RV64I-NEXT:    sll a4, a4, t0
-; RV64I-NEXT:    or a4, a3, a4
-; RV64I-NEXT:    srl a6, a6, a5
-; RV64I-NEXT:    slli t1, a7, 1
-; RV64I-NEXT:    sll t0, t1, t0
-; RV64I-NEXT:    or t0, a6, t0
-; RV64I-NEXT:    sra a5, a7, a5
-; RV64I-NEXT:    srli a7, a5, 56
-; RV64I-NEXT:    srli t1, a5, 48
-; RV64I-NEXT:    srli t2, a5, 40
-; RV64I-NEXT:    srli t3, a5, 32
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    slli a4, a1, 3
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    add a1, s6, a1
+; RV64I-NEXT:    andi a0, a4, 56
+; RV64I-NEXT:    ld a3, 0(a1)
+; RV64I-NEXT:    ld a5, 8(a1)
+; RV64I-NEXT:    ld a6, 16(a1)
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 24(a1)
+; RV64I-NEXT:    srl a0, a5, a4
+; RV64I-NEXT:    slli t1, a6, 1
+; RV64I-NEXT:    srl a1, a3, a4
+; RV64I-NEXT:    slli a5, a5, 1
+; RV64I-NEXT:    srl a3, a6, a4
+; RV64I-NEXT:    slli a6, t0, 1
+; RV64I-NEXT:    sra t0, t0, a4
+; RV64I-NEXT:    sll a4, t1, a7
+; RV64I-NEXT:    sll a5, a5, a7
+; RV64I-NEXT:    sll a6, a6, a7
+; RV64I-NEXT:    srli a7, t0, 56
+; RV64I-NEXT:    srli t1, t0, 48
+; RV64I-NEXT:    srli t2, t0, 40
+; RV64I-NEXT:    srli t3, t0, 32
+; RV64I-NEXT:    srli t4, t0, 24
+; RV64I-NEXT:    srli t5, t0, 16
+; RV64I-NEXT:    srli t6, t0, 8
+; RV64I-NEXT:    or a4, a0, a4
+; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    or a6, a3, a6
 ; RV64I-NEXT:    sb t3, 28(a2)
 ; RV64I-NEXT:    sb t2, 29(a2)
 ; RV64I-NEXT:    sb t1, 30(a2)
 ; RV64I-NEXT:    sb a7, 31(a2)
-; RV64I-NEXT:    srli a7, a5, 24
-; RV64I-NEXT:    srli t1, a5, 16
-; RV64I-NEXT:    srli t2, a5, 8
-; RV64I-NEXT:    sb a5, 24(a2)
-; RV64I-NEXT:    sb t2, 25(a2)
-; RV64I-NEXT:    sb t1, 26(a2)
-; RV64I-NEXT:    sb a7, 27(a2)
-; RV64I-NEXT:    srli a5, t0, 56
-; RV64I-NEXT:    srli a7, t0, 48
-; RV64I-NEXT:    srli t1, t0, 40
-; RV64I-NEXT:    srli t2, t0, 32
+; RV64I-NEXT:    sb t0, 24(a2)
+; RV64I-NEXT:    sb t6, 25(a2)
+; RV64I-NEXT:    sb t5, 26(a2)
+; RV64I-NEXT:    sb t4, 27(a2)
+; RV64I-NEXT:    srli a7, a6, 56
+; RV64I-NEXT:    srli t0, a6, 48
+; RV64I-NEXT:    srli t1, a6, 40
+; RV64I-NEXT:    srli t2, a6, 32
+; RV64I-NEXT:    srli t3, a6, 24
+; RV64I-NEXT:    srli t4, a6, 16
+; RV64I-NEXT:    srli a6, a6, 8
+; RV64I-NEXT:    srli t5, a5, 56
+; RV64I-NEXT:    srli t6, a5, 48
+; RV64I-NEXT:    srli s0, a5, 40
+; RV64I-NEXT:    srli s1, a5, 32
+; RV64I-NEXT:    srli s2, a5, 24
+; RV64I-NEXT:    srli s3, a5, 16
+; RV64I-NEXT:    srli a5, a5, 8
+; RV64I-NEXT:    srli s4, a4, 56
+; RV64I-NEXT:    srli s5, a4, 48
+; RV64I-NEXT:    srli s6, a4, 40
 ; RV64I-NEXT:    sb t2, 20(a2)
 ; RV64I-NEXT:    sb t1, 21(a2)
-; RV64I-NEXT:    sb a7, 22(a2)
-; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, t0, 24
-; RV64I-NEXT:    srli a7, t0, 16
-; RV64I-NEXT:    srli t0, t0, 8
-; RV64I-NEXT:    sb a6, 16(a2)
-; RV64I-NEXT:    sb t0, 17(a2)
-; RV64I-NEXT:    sb a7, 18(a2)
-; RV64I-NEXT:    sb a5, 19(a2)
-; RV64I-NEXT:    srli a5, a4, 56
-; RV64I-NEXT:    srli a6, a4, 48
-; RV64I-NEXT:    srli a7, a4, 40
-; RV64I-NEXT:    srli t0, a4, 32
-; RV64I-NEXT:    sb t0, 4(a2)
-; RV64I-NEXT:    sb a7, 5(a2)
-; RV64I-NEXT:    sb a6, 6(a2)
-; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    srli a5, a4, 24
+; RV64I-NEXT:    sb t0, 22(a2)
+; RV64I-NEXT:    sb a7, 23(a2)
+; RV64I-NEXT:    srli a7, a4, 32
+; RV64I-NEXT:    sb a3, 16(a2)
+; RV64I-NEXT:    sb a6, 17(a2)
+; RV64I-NEXT:    sb t4, 18(a2)
+; RV64I-NEXT:    sb t3, 19(a2)
+; RV64I-NEXT:    srli a3, a4, 24
+; RV64I-NEXT:    sb s1, 4(a2)
+; RV64I-NEXT:    sb s0, 5(a2)
+; RV64I-NEXT:    sb t6, 6(a2)
+; RV64I-NEXT:    sb t5, 7(a2)
 ; RV64I-NEXT:    srli a6, a4, 16
 ; RV64I-NEXT:    srli a4, a4, 8
-; RV64I-NEXT:    sb a3, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a6, 2(a2)
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a6, a1, 32
-; RV64I-NEXT:    sb a6, 12(a2)
-; RV64I-NEXT:    sb a5, 13(a2)
-; RV64I-NEXT:    sb a4, 14(a2)
-; RV64I-NEXT:    sb a3, 15(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a1, a1, 8
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    sb a5, 1(a2)
+; RV64I-NEXT:    sb s3, 2(a2)
+; RV64I-NEXT:    sb s2, 3(a2)
+; RV64I-NEXT:    sb a7, 12(a2)
+; RV64I-NEXT:    sb s6, 13(a2)
+; RV64I-NEXT:    sb s5, 14(a2)
+; RV64I-NEXT:    sb s4, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
+; RV64I-NEXT:    sb a4, 9(a2)
+; RV64I-NEXT:    sb a6, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -80
-; RV32I-NEXT:    sw s0, 76(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s1, 72(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s2, 68(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    sw s3, 64(sp) # 4-byte Folded Spill
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu t6, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu t1, 4(a0)
+; RV32I-NEXT:    lbu t3, 5(a0)
+; RV32I-NEXT:    lbu t4, 6(a0)
+; RV32I-NEXT:    lbu t5, 7(a0)
+; RV32I-NEXT:    lbu t2, 8(a0)
+; RV32I-NEXT:    lbu s1, 9(a0)
+; RV32I-NEXT:    lbu s7, 10(a0)
+; RV32I-NEXT:    lbu s8, 11(a0)
+; RV32I-NEXT:    lbu s9, 12(a0)
+; RV32I-NEXT:    lbu s10, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s6, 15(a0)
+; RV32I-NEXT:    lbu s5, 16(a0)
+; RV32I-NEXT:    lbu s11, 17(a0)
+; RV32I-NEXT:    lbu ra, 18(a0)
+; RV32I-NEXT:    lbu a3, 19(a0)
+; RV32I-NEXT:    lbu s2, 20(a0)
+; RV32I-NEXT:    lbu s3, 21(a0)
+; RV32I-NEXT:    lbu a7, 22(a0)
+; RV32I-NEXT:    lbu t0, 23(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    lbu t0, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t3, t0
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
 ; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    slli t1, t1, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t1, t4, t1
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    lbu t2, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli t2, t2, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or t2, a0, t2
-; RV32I-NEXT:    lbu t4, 0(a1)
-; RV32I-NEXT:    lbu t5, 1(a1)
-; RV32I-NEXT:    or t2, t2, t3
-; RV32I-NEXT:    lbu t3, 2(a1)
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    or a4, a4, t6
+; RV32I-NEXT:    sw a4, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    or a5, t3, t1
+; RV32I-NEXT:    or a6, t5, t4
+; RV32I-NEXT:    lbu t1, 24(a0)
+; RV32I-NEXT:    lbu t5, 25(a0)
+; RV32I-NEXT:    lbu t6, 26(a0)
+; RV32I-NEXT:    lbu s0, 27(a0)
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli s8, s8, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    or t2, s1, t2
+; RV32I-NEXT:    or t3, s8, s7
+; RV32I-NEXT:    or t4, s10, s9
+; RV32I-NEXT:    lbu s1, 28(a0)
+; RV32I-NEXT:    lbu s7, 29(a0)
+; RV32I-NEXT:    lbu s8, 30(a0)
+; RV32I-NEXT:    lbu s9, 31(a0)
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s6, s4
+; RV32I-NEXT:    or s4, s11, s5
+; RV32I-NEXT:    or s5, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s6, 1(a1)
+; RV32I-NEXT:    lbu s10, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s2, s3, s2
+; RV32I-NEXT:    addi s3, sp, 8
+; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t0, t0, 24
 ; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t6, t6, 16
+; RV32I-NEXT:    slli s0, s0, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s10, s10, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t3
-; RV32I-NEXT:    or a1, a1, t4
-; RV32I-NEXT:    srai a0, a0, 31
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    sw a0, 56(sp)
-; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
-; RV32I-NEXT:    sw a0, 40(sp)
-; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw t2, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    andi a0, a1, 28
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a6, a3, a0
-; RV32I-NEXT:    lw a3, 0(a6)
-; RV32I-NEXT:    lw a4, 4(a6)
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t5, t1
+; RV32I-NEXT:    or t1, s0, t6
+; RV32I-NEXT:    or t5, s7, s1
+; RV32I-NEXT:    or t6, s9, s8
+; RV32I-NEXT:    or a3, s6, a3
+; RV32I-NEXT:    or a1, a1, s10
+; RV32I-NEXT:    srai s0, s9, 31
+; RV32I-NEXT:    lw s1, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t3, t2
+; RV32I-NEXT:    or a0, a0, t4
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or a7, a7, s2
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    or t1, t6, t5
+; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    sw s0, 56(sp)
+; RV32I-NEXT:    sw s0, 60(sp)
+; RV32I-NEXT:    sw s0, 64(sp)
+; RV32I-NEXT:    sw s0, 68(sp)
+; RV32I-NEXT:    sw s0, 40(sp)
+; RV32I-NEXT:    sw s0, 44(sp)
+; RV32I-NEXT:    sw s0, 48(sp)
+; RV32I-NEXT:    sw s0, 52(sp)
+; RV32I-NEXT:    sw t2, 24(sp)
+; RV32I-NEXT:    sw a7, 28(sp)
+; RV32I-NEXT:    sw t0, 32(sp)
+; RV32I-NEXT:    sw t1, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    slli t1, a1, 3
-; RV32I-NEXT:    lw a7, 8(a6)
-; RV32I-NEXT:    lw t0, 12(a6)
+; RV32I-NEXT:    andi a1, a1, 28
+; RV32I-NEXT:    add a1, s3, a1
+; RV32I-NEXT:    andi a0, t1, 24
+; RV32I-NEXT:    xori t0, a0, 31
+; RV32I-NEXT:    lw a3, 0(a1)
+; RV32I-NEXT:    lw a4, 4(a1)
+; RV32I-NEXT:    lw a5, 8(a1)
+; RV32I-NEXT:    lw a6, 12(a1)
+; RV32I-NEXT:    lw a7, 16(a1)
+; RV32I-NEXT:    lw t2, 20(a1)
+; RV32I-NEXT:    lw t3, 24(a1)
+; RV32I-NEXT:    lw t4, 28(a1)
 ; RV32I-NEXT:    srl a0, a4, t1
-; RV32I-NEXT:    andi a1, t1, 24
-; RV32I-NEXT:    xori t2, a1, 31
-; RV32I-NEXT:    slli a1, a7, 1
-; RV32I-NEXT:    sll a1, a1, t2
-; RV32I-NEXT:    or a1, a0, a1
-; RV32I-NEXT:    srl a3, a3, t1
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    sll a4, a4, t2
-; RV32I-NEXT:    or a4, a3, a4
-; RV32I-NEXT:    srl a5, t0, t1
-; RV32I-NEXT:    lw t3, 16(a6)
-; RV32I-NEXT:    lw t4, 20(a6)
-; RV32I-NEXT:    lw t5, 24(a6)
-; RV32I-NEXT:    lw t6, 28(a6)
-; RV32I-NEXT:    slli a6, t3, 1
-; RV32I-NEXT:    sll a6, a6, t2
-; RV32I-NEXT:    or a6, a5, a6
-; RV32I-NEXT:    srl a7, a7, t1
-; RV32I-NEXT:    slli t0, t0, 1
-; RV32I-NEXT:    sll t0, t0, t2
-; RV32I-NEXT:    or t0, a7, t0
-; RV32I-NEXT:    srl s0, t4, t1
-; RV32I-NEXT:    slli s1, t5, 1
-; RV32I-NEXT:    sll s1, s1, t2
-; RV32I-NEXT:    or s1, s0, s1
-; RV32I-NEXT:    srl t3, t3, t1
-; RV32I-NEXT:    slli t4, t4, 1
-; RV32I-NEXT:    sll t4, t4, t2
-; RV32I-NEXT:    or t4, t3, t4
-; RV32I-NEXT:    srl t5, t5, t1
-; RV32I-NEXT:    slli s2, t6, 1
-; RV32I-NEXT:    sll t2, s2, t2
-; RV32I-NEXT:    or t2, t5, t2
-; RV32I-NEXT:    sra t1, t6, t1
-; RV32I-NEXT:    srli t6, t1, 24
-; RV32I-NEXT:    srli s2, t1, 16
-; RV32I-NEXT:    srli s3, t1, 8
+; RV32I-NEXT:    slli t5, a5, 1
+; RV32I-NEXT:    srl a1, a3, t1
+; RV32I-NEXT:    slli t6, a4, 1
+; RV32I-NEXT:    srl a3, a6, t1
+; RV32I-NEXT:    slli s0, a7, 1
+; RV32I-NEXT:    srl a4, a5, t1
+; RV32I-NEXT:    slli s1, a6, 1
+; RV32I-NEXT:    srl a5, t2, t1
+; RV32I-NEXT:    slli s2, t3, 1
+; RV32I-NEXT:    srl a6, a7, t1
+; RV32I-NEXT:    slli t2, t2, 1
+; RV32I-NEXT:    srl a7, t3, t1
+; RV32I-NEXT:    slli t3, t4, 1
+; RV32I-NEXT:    sra t1, t4, t1
+; RV32I-NEXT:    sll t4, t5, t0
+; RV32I-NEXT:    sll t5, t6, t0
+; RV32I-NEXT:    sll t6, s0, t0
+; RV32I-NEXT:    sll s0, s1, t0
+; RV32I-NEXT:    sll s1, s2, t0
+; RV32I-NEXT:    sll t2, t2, t0
+; RV32I-NEXT:    sll t3, t3, t0
+; RV32I-NEXT:    srli s2, t1, 24
+; RV32I-NEXT:    srli s3, t1, 16
+; RV32I-NEXT:    srli s4, t1, 8
+; RV32I-NEXT:    or t0, a0, t4
+; RV32I-NEXT:    or t4, a1, t5
+; RV32I-NEXT:    or t5, a3, t6
+; RV32I-NEXT:    or s0, a4, s0
+; RV32I-NEXT:    or s1, a5, s1
+; RV32I-NEXT:    or t2, a6, t2
+; RV32I-NEXT:    or t3, a7, t3
 ; RV32I-NEXT:    sb t1, 28(a2)
-; RV32I-NEXT:    sb s3, 29(a2)
-; RV32I-NEXT:    sb s2, 30(a2)
-; RV32I-NEXT:    sb t6, 31(a2)
-; RV32I-NEXT:    srli t1, t2, 24
-; RV32I-NEXT:    srli t6, t2, 16
+; RV32I-NEXT:    sb s4, 29(a2)
+; RV32I-NEXT:    sb s3, 30(a2)
+; RV32I-NEXT:    sb s2, 31(a2)
+; RV32I-NEXT:    srli t1, t3, 24
+; RV32I-NEXT:    srli t6, t3, 16
+; RV32I-NEXT:    srli t3, t3, 8
+; RV32I-NEXT:    srli s2, t2, 24
+; RV32I-NEXT:    srli s3, t2, 16
 ; RV32I-NEXT:    srli t2, t2, 8
-; RV32I-NEXT:    sb t5, 24(a2)
-; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    srli s4, s1, 24
+; RV32I-NEXT:    srli s5, s1, 16
+; RV32I-NEXT:    srli s1, s1, 8
+; RV32I-NEXT:    srli s6, s0, 24
+; RV32I-NEXT:    srli s7, s0, 16
+; RV32I-NEXT:    srli s0, s0, 8
+; RV32I-NEXT:    srli s8, t5, 24
+; RV32I-NEXT:    srli s9, t5, 16
+; RV32I-NEXT:    srli t5, t5, 8
+; RV32I-NEXT:    srli s10, t4, 24
+; RV32I-NEXT:    srli s11, t4, 16
+; RV32I-NEXT:    srli t4, t4, 8
+; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t6, 26(a2)
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli t1, t4, 24
-; RV32I-NEXT:    srli t2, t4, 16
-; RV32I-NEXT:    srli t4, t4, 8
-; RV32I-NEXT:    sb t3, 16(a2)
-; RV32I-NEXT:    sb t4, 17(a2)
-; RV32I-NEXT:    sb t2, 18(a2)
-; RV32I-NEXT:    sb t1, 19(a2)
-; RV32I-NEXT:    srli t1, s1, 24
-; RV32I-NEXT:    srli t2, s1, 16
-; RV32I-NEXT:    srli s1, s1, 8
-; RV32I-NEXT:    sb s0, 20(a2)
-; RV32I-NEXT:    sb s1, 21(a2)
-; RV32I-NEXT:    sb t2, 22(a2)
-; RV32I-NEXT:    sb t1, 23(a2)
-; RV32I-NEXT:    srli t1, t0, 24
-; RV32I-NEXT:    srli t2, t0, 16
+; RV32I-NEXT:    srli a7, t0, 24
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb t2, 17(a2)
+; RV32I-NEXT:    sb s3, 18(a2)
+; RV32I-NEXT:    sb s2, 19(a2)
+; RV32I-NEXT:    srli a6, t0, 16
 ; RV32I-NEXT:    srli t0, t0, 8
-; RV32I-NEXT:    sb a7, 8(a2)
-; RV32I-NEXT:    sb t0, 9(a2)
-; RV32I-NEXT:    sb t2, 10(a2)
-; RV32I-NEXT:    sb t1, 11(a2)
-; RV32I-NEXT:    srli a7, a6, 24
-; RV32I-NEXT:    srli t0, a6, 16
-; RV32I-NEXT:    srli a6, a6, 8
-; RV32I-NEXT:    sb a5, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb t0, 14(a2)
-; RV32I-NEXT:    sb a7, 15(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    srli a4, a4, 8
-; RV32I-NEXT:    sb a3, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a6, 2(a2)
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a1, a1, 8
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    sb s5, 22(a2)
+; RV32I-NEXT:    sb s4, 23(a2)
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb s0, 9(a2)
+; RV32I-NEXT:    sb s7, 10(a2)
+; RV32I-NEXT:    sb s6, 11(a2)
+; RV32I-NEXT:    sb a3, 12(a2)
+; RV32I-NEXT:    sb t5, 13(a2)
+; RV32I-NEXT:    sb s9, 14(a2)
+; RV32I-NEXT:    sb s8, 15(a2)
+; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb t4, 1(a2)
+; RV32I-NEXT:    sb s11, 2(a2)
+; RV32I-NEXT:    sb s10, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb a1, 5(a2)
-; RV32I-NEXT:    sb a4, 6(a2)
-; RV32I-NEXT:    sb a3, 7(a2)
-; RV32I-NEXT:    lw s0, 76(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s1, 72(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s2, 68(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    lw s3, 64(sp) # 4-byte Folded Reload
-; RV32I-NEXT:    addi sp, sp, 80
+; RV32I-NEXT:    sb t0, 5(a2)
+; RV32I-NEXT:    sb a6, 6(a2)
+; RV32I-NEXT:    sb a7, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %byteOff = load i256, ptr %byteOff.ptr, align 1
@@ -4750,383 +5082,433 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes_wordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a7, a0, 32
-; RV64I-NEXT:    lbu t0, 0(a1)
-; RV64I-NEXT:    lbu t1, 1(a1)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu t2, 3(a1)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a7, t2, a7
-; RV64I-NEXT:    lbu t1, 4(a1)
-; RV64I-NEXT:    lbu t2, 5(a1)
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t3, s5, s4
+; RV64I-NEXT:    or t4, s7, s6
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    lbu s6, 5(a1)
+; RV64I-NEXT:    or s4, s7, s4
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or t1, t2, t1
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli s6, s6, 8
+; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    mv s6, sp
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t0, s0, t6
+; RV64I-NEXT:    or t1, s5, s1
+; RV64I-NEXT:    or t2, s4, s2
+; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t3, t1, 32
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    sraiw a0, a0, 31
-; RV64I-NEXT:    sd a0, 32(sp)
-; RV64I-NEXT:    sd a0, 40(sp)
-; RV64I-NEXT:    sd a0, 48(sp)
-; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sraiw t1, t1, 31
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, t3, t0
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    sd t1, 32(sp)
+; RV64I-NEXT:    sd t1, 40(sp)
+; RV64I-NEXT:    sd t1, 48(sp)
+; RV64I-NEXT:    sd t1, 56(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a6, 24(sp)
-; RV64I-NEXT:    slli a0, a1, 2
-; RV64I-NEXT:    andi a0, a0, 24
-; RV64I-NEXT:    mv a3, sp
-; RV64I-NEXT:    add a0, a3, a0
-; RV64I-NEXT:    ld a4, 0(a0)
-; RV64I-NEXT:    ld a5, 8(a0)
-; RV64I-NEXT:    slli a6, a1, 5
-; RV64I-NEXT:    ld a7, 16(a0)
-; RV64I-NEXT:    ld t0, 24(a0)
-; RV64I-NEXT:    srl a3, a5, a6
-; RV64I-NEXT:    andi a0, a6, 32
-; RV64I-NEXT:    xori t1, a0, 63
-; RV64I-NEXT:    slli a0, a7, 1
-; RV64I-NEXT:    sll a0, a0, t1
-; RV64I-NEXT:    or a0, a3, a0
-; RV64I-NEXT:    srl t2, a4, a6
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    slli a3, a1, 5
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    andi a0, a3, 32
+; RV64I-NEXT:    add a1, s6, a1
+; RV64I-NEXT:    ld a4, 0(a1)
+; RV64I-NEXT:    ld a5, 8(a1)
+; RV64I-NEXT:    ld a6, 16(a1)
+; RV64I-NEXT:    xori a7, a0, 63
+; RV64I-NEXT:    ld t0, 24(a1)
+; RV64I-NEXT:    srl a0, a5, a3
+; RV64I-NEXT:    slli t1, a6, 1
+; RV64I-NEXT:    srl a1, a4, a3
 ; RV64I-NEXT:    slli a5, a5, 1
-; RV64I-NEXT:    sll a1, a5, t1
-; RV64I-NEXT:    or a1, t2, a1
-; RV64I-NEXT:    srl a5, a7, a6
-; RV64I-NEXT:    slli a4, t0, 1
-; RV64I-NEXT:    sll a4, a4, t1
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    sra a6, t0, a6
-; RV64I-NEXT:    srli a7, a5, 24
-; RV64I-NEXT:    srli t0, a5, 16
-; RV64I-NEXT:    srli t1, a5, 8
-; RV64I-NEXT:    sb a5, 16(a2)
-; RV64I-NEXT:    sb t1, 17(a2)
-; RV64I-NEXT:    sb t0, 18(a2)
+; RV64I-NEXT:    srl a4, a6, a3
+; RV64I-NEXT:    slli a6, t0, 1
+; RV64I-NEXT:    sra a3, t0, a3
+; RV64I-NEXT:    sll t0, t1, a7
+; RV64I-NEXT:    sll a5, a5, a7
+; RV64I-NEXT:    sll a6, a6, a7
+; RV64I-NEXT:    srli a7, a4, 24
+; RV64I-NEXT:    srli t1, a4, 16
+; RV64I-NEXT:    srli t2, a4, 8
+; RV64I-NEXT:    srli t3, a3, 56
+; RV64I-NEXT:    srli t4, a3, 48
+; RV64I-NEXT:    srli t5, a3, 40
+; RV64I-NEXT:    srli t6, a3, 32
+; RV64I-NEXT:    srli s0, a3, 24
+; RV64I-NEXT:    srli s1, a3, 16
+; RV64I-NEXT:    srli s2, a3, 8
+; RV64I-NEXT:    srli s3, a1, 24
+; RV64I-NEXT:    srli s4, a1, 16
+; RV64I-NEXT:    srli s5, a1, 8
+; RV64I-NEXT:    srli s6, a0, 24
+; RV64I-NEXT:    or a6, a4, a6
+; RV64I-NEXT:    sb a4, 16(a2)
+; RV64I-NEXT:    sb t2, 17(a2)
+; RV64I-NEXT:    sb t1, 18(a2)
 ; RV64I-NEXT:    sb a7, 19(a2)
-; RV64I-NEXT:    srli a5, a6, 56
-; RV64I-NEXT:    srli a7, a6, 48
-; RV64I-NEXT:    srli t0, a6, 40
-; RV64I-NEXT:    srli t1, a6, 32
-; RV64I-NEXT:    sb t1, 28(a2)
-; RV64I-NEXT:    sb t0, 29(a2)
-; RV64I-NEXT:    sb a7, 30(a2)
-; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a6, 24
-; RV64I-NEXT:    srli a7, a6, 16
-; RV64I-NEXT:    srli t0, a6, 8
-; RV64I-NEXT:    sb a6, 24(a2)
-; RV64I-NEXT:    sb t0, 25(a2)
-; RV64I-NEXT:    sb a7, 26(a2)
-; RV64I-NEXT:    sb a5, 27(a2)
-; RV64I-NEXT:    srli a5, t2, 24
-; RV64I-NEXT:    srli a6, t2, 16
-; RV64I-NEXT:    srli a7, t2, 8
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb a7, 1(a2)
-; RV64I-NEXT:    sb a6, 2(a2)
-; RV64I-NEXT:    sb a5, 3(a2)
-; RV64I-NEXT:    srli a5, a3, 24
-; RV64I-NEXT:    srli a6, a3, 16
-; RV64I-NEXT:    srli a7, a3, 8
-; RV64I-NEXT:    sb a3, 8(a2)
+; RV64I-NEXT:    srli a4, a0, 16
+; RV64I-NEXT:    sb t6, 28(a2)
+; RV64I-NEXT:    sb t5, 29(a2)
+; RV64I-NEXT:    sb t4, 30(a2)
+; RV64I-NEXT:    sb t3, 31(a2)
+; RV64I-NEXT:    srli a7, a0, 8
+; RV64I-NEXT:    or t0, a0, t0
+; RV64I-NEXT:    or a5, a1, a5
+; RV64I-NEXT:    sb a3, 24(a2)
+; RV64I-NEXT:    sb s2, 25(a2)
+; RV64I-NEXT:    sb s1, 26(a2)
+; RV64I-NEXT:    sb s0, 27(a2)
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    sb s5, 1(a2)
+; RV64I-NEXT:    sb s4, 2(a2)
+; RV64I-NEXT:    sb s3, 3(a2)
+; RV64I-NEXT:    sb a0, 8(a2)
 ; RV64I-NEXT:    sb a7, 9(a2)
-; RV64I-NEXT:    sb a6, 10(a2)
-; RV64I-NEXT:    sb a5, 11(a2)
-; RV64I-NEXT:    srli a3, a4, 56
-; RV64I-NEXT:    srli a5, a4, 48
-; RV64I-NEXT:    srli a6, a4, 40
-; RV64I-NEXT:    srli a4, a4, 32
+; RV64I-NEXT:    sb a4, 10(a2)
+; RV64I-NEXT:    sb s6, 11(a2)
+; RV64I-NEXT:    srli a0, a6, 56
+; RV64I-NEXT:    srli a1, a6, 48
+; RV64I-NEXT:    srli a3, a6, 40
+; RV64I-NEXT:    srli a4, a6, 32
+; RV64I-NEXT:    srli a6, a5, 56
+; RV64I-NEXT:    srli a7, a5, 48
+; RV64I-NEXT:    srli t1, a5, 40
+; RV64I-NEXT:    srli a5, a5, 32
+; RV64I-NEXT:    srli t2, t0, 56
+; RV64I-NEXT:    srli t3, t0, 48
+; RV64I-NEXT:    srli t4, t0, 40
+; RV64I-NEXT:    srli t0, t0, 32
 ; RV64I-NEXT:    sb a4, 20(a2)
-; RV64I-NEXT:    sb a6, 21(a2)
-; RV64I-NEXT:    sb a5, 22(a2)
-; RV64I-NEXT:    sb a3, 23(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a1, a1, 32
-; RV64I-NEXT:    sb a1, 4(a2)
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    sb a4, 6(a2)
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a0, a0, 32
-; RV64I-NEXT:    sb a0, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb a3, 21(a2)
+; RV64I-NEXT:    sb a1, 22(a2)
+; RV64I-NEXT:    sb a0, 23(a2)
+; RV64I-NEXT:    sb a5, 4(a2)
+; RV64I-NEXT:    sb t1, 5(a2)
+; RV64I-NEXT:    sb a7, 6(a2)
+; RV64I-NEXT:    sb a6, 7(a2)
+; RV64I-NEXT:    sb t0, 12(a2)
+; RV64I-NEXT:    sb t4, 13(a2)
+; RV64I-NEXT:    sb t3, 14(a2)
+; RV64I-NEXT:    sb t2, 15(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes_wordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 0(a0)
+; RV32I-NEXT:    lbu t0, 1(a0)
+; RV32I-NEXT:    lbu t1, 2(a0)
+; RV32I-NEXT:    lbu t6, 3(a0)
+; RV32I-NEXT:    lbu s7, 4(a0)
+; RV32I-NEXT:    lbu s8, 5(a0)
+; RV32I-NEXT:    lbu s3, 6(a0)
+; RV32I-NEXT:    lbu s5, 7(a0)
+; RV32I-NEXT:    lbu s4, 8(a0)
+; RV32I-NEXT:    lbu s9, 9(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s6, 13(a0)
+; RV32I-NEXT:    lbu a5, 14(a0)
+; RV32I-NEXT:    lbu a7, 15(a0)
+; RV32I-NEXT:    lbu a3, 16(a0)
+; RV32I-NEXT:    lbu t2, 17(a0)
+; RV32I-NEXT:    lbu t3, 18(a0)
+; RV32I-NEXT:    lbu t4, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu t5, 21(a0)
+; RV32I-NEXT:    lbu s0, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or a6, t0, a6
+; RV32I-NEXT:    or t0, t6, t1
+; RV32I-NEXT:    or t1, s8, s7
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s7, 25(a0)
+; RV32I-NEXT:    lbu s8, 26(a0)
+; RV32I-NEXT:    lbu ra, 27(a0)
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s3, s5, s3
+; RV32I-NEXT:    or s4, s9, s4
+; RV32I-NEXT:    or s5, s11, s10
+; RV32I-NEXT:    lbu s9, 28(a0)
+; RV32I-NEXT:    lbu s10, 29(a0)
+; RV32I-NEXT:    lbu s11, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    or s2, s6, s2
+; RV32I-NEXT:    addi s6, sp, 8
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    lbu t0, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
 ; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t3, t0
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t1, t4, t1
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    lbu t2, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli ra, ra, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    slli s11, s11, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or t2, a0, t2
-; RV32I-NEXT:    or t2, t2, t3
-; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a3, t2, a3
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or a4, t5, a4
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or t2, s7, t6
+; RV32I-NEXT:    or t3, ra, s8
+; RV32I-NEXT:    or t4, s10, s9
+; RV32I-NEXT:    or t5, a0, s11
 ; RV32I-NEXT:    srai a0, a0, 31
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    andi a1, a1, 28
+; RV32I-NEXT:    or a6, t0, a6
+; RV32I-NEXT:    or t0, s3, t1
+; RV32I-NEXT:    or t1, s5, s4
+; RV32I-NEXT:    or a5, a5, s2
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or a7, t3, t2
+; RV32I-NEXT:    or t2, t5, t4
 ; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 64(sp)
+; RV32I-NEXT:    sw a0, 68(sp)
 ; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw t2, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    slli a1, a1, 2
-; RV32I-NEXT:    andi a1, a1, 28
-; RV32I-NEXT:    mv a0, sp
-; RV32I-NEXT:    add a4, a0, a1
-; RV32I-NEXT:    lw a5, 16(a4)
-; RV32I-NEXT:    lw a6, 20(a4)
-; RV32I-NEXT:    lw a7, 24(a4)
-; RV32I-NEXT:    lw a1, 0(a4)
-; RV32I-NEXT:    lw a0, 4(a4)
-; RV32I-NEXT:    lw t0, 8(a4)
-; RV32I-NEXT:    lw a3, 12(a4)
-; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    add s6, s6, a1
+; RV32I-NEXT:    sw a3, 24(sp)
+; RV32I-NEXT:    sw a4, 28(sp)
+; RV32I-NEXT:    sw a7, 32(sp)
+; RV32I-NEXT:    sw t2, 36(sp)
+; RV32I-NEXT:    sw a6, 8(sp)
+; RV32I-NEXT:    sw t0, 12(sp)
+; RV32I-NEXT:    sw t1, 16(sp)
+; RV32I-NEXT:    sw a5, 20(sp)
+; RV32I-NEXT:    lw a6, 16(s6)
+; RV32I-NEXT:    lw a5, 20(s6)
+; RV32I-NEXT:    lw a7, 24(s6)
+; RV32I-NEXT:    lw a1, 0(s6)
+; RV32I-NEXT:    lw a0, 4(s6)
+; RV32I-NEXT:    lw a4, 8(s6)
+; RV32I-NEXT:    lw a3, 12(s6)
+; RV32I-NEXT:    lw t0, 28(s6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
+; RV32I-NEXT:    srli t4, t0, 24
+; RV32I-NEXT:    srli t5, t0, 16
+; RV32I-NEXT:    srli t6, t0, 8
+; RV32I-NEXT:    srli s0, a6, 24
+; RV32I-NEXT:    srli s1, a6, 16
+; RV32I-NEXT:    srli s2, a6, 8
+; RV32I-NEXT:    srli s3, a5, 24
+; RV32I-NEXT:    srli s4, a5, 16
+; RV32I-NEXT:    srli s5, a5, 8
+; RV32I-NEXT:    srli s6, a4, 24
+; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
+; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
 ; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t2, 26(a2)
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a4, 24
-; RV32I-NEXT:    srli t1, a4, 16
-; RV32I-NEXT:    srli t2, a4, 8
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb t2, 29(a2)
-; RV32I-NEXT:    sb t1, 30(a2)
-; RV32I-NEXT:    sb a7, 31(a2)
-; RV32I-NEXT:    srli a4, a5, 24
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    srli t1, a5, 8
-; RV32I-NEXT:    sb a5, 16(a2)
-; RV32I-NEXT:    sb t1, 17(a2)
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    sb a4, 19(a2)
-; RV32I-NEXT:    srli a4, a6, 24
-; RV32I-NEXT:    srli a5, a6, 16
-; RV32I-NEXT:    srli a7, a6, 8
-; RV32I-NEXT:    sb a6, 20(a2)
-; RV32I-NEXT:    sb a7, 21(a2)
-; RV32I-NEXT:    sb a5, 22(a2)
-; RV32I-NEXT:    sb a4, 23(a2)
-; RV32I-NEXT:    srli a4, t0, 24
-; RV32I-NEXT:    srli a5, t0, 16
-; RV32I-NEXT:    srli a6, t0, 8
-; RV32I-NEXT:    sb t0, 8(a2)
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    sb a5, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    sb t4, 31(a2)
+; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb s2, 17(a2)
+; RV32I-NEXT:    sb s1, 18(a2)
+; RV32I-NEXT:    sb s0, 19(a2)
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb s5, 21(a2)
+; RV32I-NEXT:    sb s4, 22(a2)
+; RV32I-NEXT:    sb s3, 23(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb s7, 10(a2)
+; RV32I-NEXT:    sb s6, 11(a2)
+; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    sb ra, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %wordOff = load i256, ptr %wordOff.ptr, align 1
@@ -5139,346 +5521,396 @@ define void @ashr_32bytes_wordOff(ptr %src.ptr, ptr %wordOff.ptr, ptr %dst) noun
 define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes_dwordOff:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
-; RV64I-NEXT:    lbu a5, 2(a0)
-; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a5, 0(a0)
+; RV64I-NEXT:    lbu a7, 1(a0)
+; RV64I-NEXT:    lbu t1, 2(a0)
+; RV64I-NEXT:    lbu s3, 3(a0)
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu s8, 5(a0)
+; RV64I-NEXT:    lbu s9, 6(a0)
+; RV64I-NEXT:    lbu s10, 7(a0)
+; RV64I-NEXT:    lbu s2, 8(a0)
+; RV64I-NEXT:    lbu s4, 9(a0)
+; RV64I-NEXT:    lbu s5, 10(a0)
+; RV64I-NEXT:    lbu s6, 11(a0)
+; RV64I-NEXT:    lbu s7, 12(a0)
+; RV64I-NEXT:    lbu s11, 13(a0)
+; RV64I-NEXT:    lbu t4, 14(a0)
+; RV64I-NEXT:    lbu t5, 15(a0)
+; RV64I-NEXT:    lbu a3, 16(a0)
+; RV64I-NEXT:    lbu a6, 17(a0)
+; RV64I-NEXT:    lbu t2, 18(a0)
+; RV64I-NEXT:    lbu t3, 19(a0)
+; RV64I-NEXT:    lbu a4, 20(a0)
+; RV64I-NEXT:    lbu t6, 21(a0)
+; RV64I-NEXT:    lbu s0, 22(a0)
+; RV64I-NEXT:    lbu s1, 23(a0)
 ; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, s3, t1
+; RV64I-NEXT:    or t0, s8, t0
+; RV64I-NEXT:    or t1, s10, s9
+; RV64I-NEXT:    lbu s3, 24(a0)
+; RV64I-NEXT:    lbu s8, 25(a0)
+; RV64I-NEXT:    lbu s9, 26(a0)
+; RV64I-NEXT:    lbu s10, 27(a0)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s11, s11, 8
+; RV64I-NEXT:    or s2, s4, s2
+; RV64I-NEXT:    or s4, s6, s5
+; RV64I-NEXT:    or s5, s11, s7
+; RV64I-NEXT:    lbu s6, 28(a0)
+; RV64I-NEXT:    lbu s7, 29(a0)
+; RV64I-NEXT:    lbu s11, 30(a0)
 ; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a7, a0, 32
-; RV64I-NEXT:    or a6, a7, a6
 ; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli t4, t4, 16
+; RV64I-NEXT:    slli t5, t5, 24
+; RV64I-NEXT:    or t4, t5, t4
+; RV64I-NEXT:    mv t5, sp
+; RV64I-NEXT:    slli a6, a6, 8
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    slli t6, t6, 8
+; RV64I-NEXT:    slli s0, s0, 16
+; RV64I-NEXT:    slli s1, s1, 24
+; RV64I-NEXT:    slli s8, s8, 8
+; RV64I-NEXT:    slli s9, s9, 16
+; RV64I-NEXT:    slli s10, s10, 24
+; RV64I-NEXT:    slli s7, s7, 8
+; RV64I-NEXT:    slli s11, s11, 16
+; RV64I-NEXT:    slli a0, a0, 24
+; RV64I-NEXT:    slli a1, a1, 3
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a6, t3, t2
+; RV64I-NEXT:    or a4, t6, a4
+; RV64I-NEXT:    or s0, s1, s0
+; RV64I-NEXT:    or t2, s8, s3
+; RV64I-NEXT:    or t3, s10, s9
+; RV64I-NEXT:    or t6, s7, s6
+; RV64I-NEXT:    or a0, a0, s11
+; RV64I-NEXT:    andi a1, a1, 24
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or t0, s4, s2
+; RV64I-NEXT:    or t1, t4, s5
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    or a4, s0, a4
+; RV64I-NEXT:    or a6, t3, t2
+; RV64I-NEXT:    or a0, a0, t6
+; RV64I-NEXT:    add t5, t5, a1
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a1, a0, 32
 ; RV64I-NEXT:    sraiw a0, a0, 31
+; RV64I-NEXT:    or a5, a7, a5
+; RV64I-NEXT:    or a7, t1, t0
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a1, a1, a6
 ; RV64I-NEXT:    sd a0, 32(sp)
 ; RV64I-NEXT:    sd a0, 40(sp)
 ; RV64I-NEXT:    sd a0, 48(sp)
 ; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    sd a3, 0(sp)
-; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a6, 24(sp)
-; RV64I-NEXT:    slli a1, a1, 3
-; RV64I-NEXT:    andi a1, a1, 24
-; RV64I-NEXT:    mv a0, sp
-; RV64I-NEXT:    add a3, a0, a1
-; RV64I-NEXT:    ld a4, 16(a3)
-; RV64I-NEXT:    ld a0, 8(a3)
-; RV64I-NEXT:    ld a1, 0(a3)
-; RV64I-NEXT:    ld a3, 24(a3)
+; RV64I-NEXT:    sd a5, 0(sp)
+; RV64I-NEXT:    sd a7, 8(sp)
+; RV64I-NEXT:    sd a3, 16(sp)
+; RV64I-NEXT:    sd a1, 24(sp)
+; RV64I-NEXT:    ld a4, 16(t5)
+; RV64I-NEXT:    ld a0, 8(t5)
+; RV64I-NEXT:    ld a1, 0(t5)
+; RV64I-NEXT:    ld a3, 24(t5)
 ; RV64I-NEXT:    srli a5, a4, 56
 ; RV64I-NEXT:    srli a6, a4, 48
 ; RV64I-NEXT:    srli a7, a4, 40
 ; RV64I-NEXT:    srli t0, a4, 32
+; RV64I-NEXT:    srli t1, a4, 24
+; RV64I-NEXT:    srli t2, a4, 16
+; RV64I-NEXT:    srli t3, a4, 8
+; RV64I-NEXT:    srli t4, a3, 56
+; RV64I-NEXT:    srli t5, a3, 48
+; RV64I-NEXT:    srli t6, a3, 40
+; RV64I-NEXT:    srli s0, a3, 32
+; RV64I-NEXT:    srli s1, a3, 24
+; RV64I-NEXT:    srli s2, a3, 16
+; RV64I-NEXT:    srli s3, a3, 8
+; RV64I-NEXT:    srli s4, a1, 56
+; RV64I-NEXT:    srli s5, a1, 48
+; RV64I-NEXT:    srli s6, a1, 40
+; RV64I-NEXT:    srli s7, a1, 32
+; RV64I-NEXT:    srli s8, a1, 24
+; RV64I-NEXT:    srli s9, a1, 16
+; RV64I-NEXT:    srli s10, a1, 8
+; RV64I-NEXT:    srli s11, a0, 56
 ; RV64I-NEXT:    sb t0, 20(a2)
 ; RV64I-NEXT:    sb a7, 21(a2)
 ; RV64I-NEXT:    sb a6, 22(a2)
 ; RV64I-NEXT:    sb a5, 23(a2)
-; RV64I-NEXT:    srli a5, a4, 24
-; RV64I-NEXT:    srli a6, a4, 16
-; RV64I-NEXT:    srli a7, a4, 8
+; RV64I-NEXT:    srli a5, a0, 48
 ; RV64I-NEXT:    sb a4, 16(a2)
-; RV64I-NEXT:    sb a7, 17(a2)
-; RV64I-NEXT:    sb a6, 18(a2)
-; RV64I-NEXT:    sb a5, 19(a2)
-; RV64I-NEXT:    srli a4, a3, 56
-; RV64I-NEXT:    srli a5, a3, 48
-; RV64I-NEXT:    srli a6, a3, 40
-; RV64I-NEXT:    srli a7, a3, 32
-; RV64I-NEXT:    sb a7, 28(a2)
-; RV64I-NEXT:    sb a6, 29(a2)
-; RV64I-NEXT:    sb a5, 30(a2)
-; RV64I-NEXT:    sb a4, 31(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    srli a6, a3, 8
+; RV64I-NEXT:    sb t3, 17(a2)
+; RV64I-NEXT:    sb t2, 18(a2)
+; RV64I-NEXT:    sb t1, 19(a2)
+; RV64I-NEXT:    srli a4, a0, 40
+; RV64I-NEXT:    sb s0, 28(a2)
+; RV64I-NEXT:    sb t6, 29(a2)
+; RV64I-NEXT:    sb t5, 30(a2)
+; RV64I-NEXT:    sb t4, 31(a2)
+; RV64I-NEXT:    srli a6, a0, 32
 ; RV64I-NEXT:    sb a3, 24(a2)
-; RV64I-NEXT:    sb a6, 25(a2)
-; RV64I-NEXT:    sb a5, 26(a2)
-; RV64I-NEXT:    sb a4, 27(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a6, a1, 32
-; RV64I-NEXT:    sb a6, 4(a2)
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    sb a4, 6(a2)
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    sb s3, 25(a2)
+; RV64I-NEXT:    sb s2, 26(a2)
+; RV64I-NEXT:    sb s1, 27(a2)
+; RV64I-NEXT:    srli a3, a0, 24
+; RV64I-NEXT:    sb s7, 4(a2)
+; RV64I-NEXT:    sb s6, 5(a2)
+; RV64I-NEXT:    sb s5, 6(a2)
+; RV64I-NEXT:    sb s4, 7(a2)
+; RV64I-NEXT:    srli a7, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
-; RV64I-NEXT:    sb a4, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 12(a2)
+; RV64I-NEXT:    sb s10, 1(a2)
+; RV64I-NEXT:    sb s9, 2(a2)
+; RV64I-NEXT:    sb s8, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 8
+; RV64I-NEXT:    sb a6, 12(a2)
 ; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    sb s11, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a7, 10(a2)
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes_dwordOff:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a6, 0(a0)
+; RV32I-NEXT:    lbu t0, 1(a0)
+; RV32I-NEXT:    lbu t1, 2(a0)
+; RV32I-NEXT:    lbu t6, 3(a0)
+; RV32I-NEXT:    lbu s7, 4(a0)
+; RV32I-NEXT:    lbu s8, 5(a0)
+; RV32I-NEXT:    lbu s3, 6(a0)
+; RV32I-NEXT:    lbu s5, 7(a0)
+; RV32I-NEXT:    lbu s4, 8(a0)
+; RV32I-NEXT:    lbu s9, 9(a0)
+; RV32I-NEXT:    lbu s10, 10(a0)
+; RV32I-NEXT:    lbu s11, 11(a0)
+; RV32I-NEXT:    lbu s2, 12(a0)
+; RV32I-NEXT:    lbu s6, 13(a0)
+; RV32I-NEXT:    lbu a5, 14(a0)
+; RV32I-NEXT:    lbu a7, 15(a0)
+; RV32I-NEXT:    lbu a3, 16(a0)
+; RV32I-NEXT:    lbu t2, 17(a0)
+; RV32I-NEXT:    lbu t3, 18(a0)
+; RV32I-NEXT:    lbu t4, 19(a0)
+; RV32I-NEXT:    lbu a4, 20(a0)
+; RV32I-NEXT:    lbu t5, 21(a0)
+; RV32I-NEXT:    lbu s0, 22(a0)
+; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    slli s8, s8, 8
+; RV32I-NEXT:    or a6, t0, a6
+; RV32I-NEXT:    or t0, t6, t1
+; RV32I-NEXT:    or t1, s8, s7
+; RV32I-NEXT:    lbu t6, 24(a0)
+; RV32I-NEXT:    lbu s7, 25(a0)
+; RV32I-NEXT:    lbu s8, 26(a0)
+; RV32I-NEXT:    lbu ra, 27(a0)
+; RV32I-NEXT:    slli s3, s3, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s3, s5, s3
+; RV32I-NEXT:    or s4, s9, s4
+; RV32I-NEXT:    or s5, s11, s10
+; RV32I-NEXT:    lbu s9, 28(a0)
+; RV32I-NEXT:    lbu s10, 29(a0)
+; RV32I-NEXT:    lbu s11, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    or s2, s6, s2
+; RV32I-NEXT:    addi s6, sp, 8
 ; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or a7, a7, t0
-; RV32I-NEXT:    lbu t0, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
 ; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t0, t3, t0
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t0, t0, t1
-; RV32I-NEXT:    lbu t1, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t3, t3, 16
 ; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t1, t4, t1
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t1, t1, t2
-; RV32I-NEXT:    lbu t2, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    slli s0, s0, 16
+; RV32I-NEXT:    slli s1, s1, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli ra, ra, 24
+; RV32I-NEXT:    slli s10, s10, 8
+; RV32I-NEXT:    slli s11, s11, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or t2, a0, t2
-; RV32I-NEXT:    or t2, t2, t3
-; RV32I-NEXT:    lbu a1, 0(a1)
+; RV32I-NEXT:    slli a1, a1, 3
+; RV32I-NEXT:    or a5, a7, a5
+; RV32I-NEXT:    or a3, t2, a3
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or a4, t5, a4
+; RV32I-NEXT:    or s0, s1, s0
+; RV32I-NEXT:    or t2, s7, t6
+; RV32I-NEXT:    or t3, ra, s8
+; RV32I-NEXT:    or t4, s10, s9
+; RV32I-NEXT:    or t5, a0, s11
 ; RV32I-NEXT:    srai a0, a0, 31
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    andi a1, a1, 24
+; RV32I-NEXT:    or a6, t0, a6
+; RV32I-NEXT:    or t0, s3, t1
+; RV32I-NEXT:    or t1, s5, s4
+; RV32I-NEXT:    or a5, a5, s2
+; RV32I-NEXT:    or a3, a7, a3
+; RV32I-NEXT:    or a4, s0, a4
+; RV32I-NEXT:    or a7, t3, t2
+; RV32I-NEXT:    or t2, t5, t4
 ; RV32I-NEXT:    sw a0, 56(sp)
 ; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
+; RV32I-NEXT:    sw a0, 64(sp)
+; RV32I-NEXT:    sw a0, 68(sp)
 ; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw a7, 16(sp)
-; RV32I-NEXT:    sw t0, 20(sp)
-; RV32I-NEXT:    sw t1, 24(sp)
-; RV32I-NEXT:    sw t2, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
-; RV32I-NEXT:    slli a1, a1, 3
-; RV32I-NEXT:    andi a1, a1, 24
-; RV32I-NEXT:    mv a0, sp
-; RV32I-NEXT:    add a4, a0, a1
-; RV32I-NEXT:    lw a5, 16(a4)
-; RV32I-NEXT:    lw a6, 20(a4)
-; RV32I-NEXT:    lw a7, 24(a4)
-; RV32I-NEXT:    lw a1, 0(a4)
-; RV32I-NEXT:    lw a0, 4(a4)
-; RV32I-NEXT:    lw t0, 8(a4)
-; RV32I-NEXT:    lw a3, 12(a4)
-; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    sw a0, 48(sp)
+; RV32I-NEXT:    sw a0, 52(sp)
+; RV32I-NEXT:    add s6, s6, a1
+; RV32I-NEXT:    sw a3, 24(sp)
+; RV32I-NEXT:    sw a4, 28(sp)
+; RV32I-NEXT:    sw a7, 32(sp)
+; RV32I-NEXT:    sw t2, 36(sp)
+; RV32I-NEXT:    sw a6, 8(sp)
+; RV32I-NEXT:    sw t0, 12(sp)
+; RV32I-NEXT:    sw t1, 16(sp)
+; RV32I-NEXT:    sw a5, 20(sp)
+; RV32I-NEXT:    lw a6, 16(s6)
+; RV32I-NEXT:    lw a5, 20(s6)
+; RV32I-NEXT:    lw a7, 24(s6)
+; RV32I-NEXT:    lw a1, 0(s6)
+; RV32I-NEXT:    lw a0, 4(s6)
+; RV32I-NEXT:    lw a4, 8(s6)
+; RV32I-NEXT:    lw a3, 12(s6)
+; RV32I-NEXT:    lw t0, 28(s6)
 ; RV32I-NEXT:    srli t1, a7, 24
 ; RV32I-NEXT:    srli t2, a7, 16
 ; RV32I-NEXT:    srli t3, a7, 8
+; RV32I-NEXT:    srli t4, t0, 24
+; RV32I-NEXT:    srli t5, t0, 16
+; RV32I-NEXT:    srli t6, t0, 8
+; RV32I-NEXT:    srli s0, a6, 24
+; RV32I-NEXT:    srli s1, a6, 16
+; RV32I-NEXT:    srli s2, a6, 8
+; RV32I-NEXT:    srli s3, a5, 24
+; RV32I-NEXT:    srli s4, a5, 16
+; RV32I-NEXT:    srli s5, a5, 8
+; RV32I-NEXT:    srli s6, a4, 24
+; RV32I-NEXT:    srli s7, a4, 16
+; RV32I-NEXT:    srli s8, a4, 8
+; RV32I-NEXT:    srli s9, a3, 24
+; RV32I-NEXT:    srli s10, a3, 16
+; RV32I-NEXT:    srli s11, a3, 8
+; RV32I-NEXT:    srli ra, a1, 24
 ; RV32I-NEXT:    sb a7, 24(a2)
 ; RV32I-NEXT:    sb t3, 25(a2)
 ; RV32I-NEXT:    sb t2, 26(a2)
 ; RV32I-NEXT:    sb t1, 27(a2)
-; RV32I-NEXT:    srli a7, a4, 24
-; RV32I-NEXT:    srli t1, a4, 16
-; RV32I-NEXT:    srli t2, a4, 8
-; RV32I-NEXT:    sb a4, 28(a2)
-; RV32I-NEXT:    sb t2, 29(a2)
-; RV32I-NEXT:    sb t1, 30(a2)
-; RV32I-NEXT:    sb a7, 31(a2)
-; RV32I-NEXT:    srli a4, a5, 24
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    srli t1, a5, 8
-; RV32I-NEXT:    sb a5, 16(a2)
-; RV32I-NEXT:    sb t1, 17(a2)
-; RV32I-NEXT:    sb a7, 18(a2)
-; RV32I-NEXT:    sb a4, 19(a2)
-; RV32I-NEXT:    srli a4, a6, 24
-; RV32I-NEXT:    srli a5, a6, 16
-; RV32I-NEXT:    srli a7, a6, 8
-; RV32I-NEXT:    sb a6, 20(a2)
-; RV32I-NEXT:    sb a7, 21(a2)
-; RV32I-NEXT:    sb a5, 22(a2)
-; RV32I-NEXT:    sb a4, 23(a2)
-; RV32I-NEXT:    srli a4, t0, 24
-; RV32I-NEXT:    srli a5, t0, 16
-; RV32I-NEXT:    srli a6, t0, 8
-; RV32I-NEXT:    sb t0, 8(a2)
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    sb a5, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a1, 16
+; RV32I-NEXT:    sb t0, 28(a2)
+; RV32I-NEXT:    sb t6, 29(a2)
+; RV32I-NEXT:    sb t5, 30(a2)
+; RV32I-NEXT:    sb t4, 31(a2)
+; RV32I-NEXT:    srli t0, a1, 8
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb s2, 17(a2)
+; RV32I-NEXT:    sb s1, 18(a2)
+; RV32I-NEXT:    sb s0, 19(a2)
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    sb a5, 20(a2)
+; RV32I-NEXT:    sb s5, 21(a2)
+; RV32I-NEXT:    sb s4, 22(a2)
+; RV32I-NEXT:    sb s3, 23(a2)
+; RV32I-NEXT:    srli a5, a0, 16
+; RV32I-NEXT:    sb a4, 8(a2)
+; RV32I-NEXT:    sb s8, 9(a2)
+; RV32I-NEXT:    sb s7, 10(a2)
+; RV32I-NEXT:    sb s6, 11(a2)
+; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    sb s11, 13(a2)
+; RV32I-NEXT:    sb s10, 14(a2)
+; RV32I-NEXT:    sb s9, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb t0, 1(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    sb ra, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %dwordOff = load i256, ptr %dwordOff.ptr, align 1

diff  --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index d36c660b3b1421..b2c130c2d7c10a 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -11,8 +11,8 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lb a0, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
@@ -29,25 +29,25 @@ define void @lshr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-LABEL: lshr_4bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    srli a3, a0, 24
@@ -72,8 +72,8 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lb a0, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
@@ -90,25 +90,25 @@ define void @shl_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-LABEL: shl_4bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    srli a3, a0, 24
@@ -133,8 +133,8 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lb a0, 3(a0)
 ; RV64I-NEXT:    slli a3, a3, 8
 ; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    lbu a1, 0(a1)
+; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a0, a0, 24
 ; RV64I-NEXT:    or a0, a0, a5
 ; RV64I-NEXT:    or a0, a0, a3
@@ -151,25 +151,25 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-LABEL: ashr_4bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a0, 3(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a0, 0(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a5
-; RV32I-NEXT:    lbu a4, 0(a1)
-; RV32I-NEXT:    lbu a5, 1(a1)
-; RV32I-NEXT:    or a0, a0, a3
-; RV32I-NEXT:    lbu a3, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a0, a3, a0
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu a6, 1(a1)
 ; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    slli a3, a3, 16
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a0, a4, a0
 ; RV32I-NEXT:    or a1, a1, a3
-; RV32I-NEXT:    or a1, a1, a4
 ; RV32I-NEXT:    sra a0, a0, a1
 ; RV32I-NEXT:    srli a1, a0, 16
 ; RV32I-NEXT:    srli a3, a0, 24
@@ -189,93 +189,93 @@ define void @ashr_4bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    or t0, t2, t0
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a4, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a4, 0(a1)
-; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 2(a1)
-; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 5(a1)
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    lbu a4, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    srl a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 48
 ; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    srli a5, a0, 40
+; RV64I-NEXT:    srli a6, a0, 16
+; RV64I-NEXT:    srli a7, a0, 24
+; RV64I-NEXT:    srli t0, a0, 8
 ; RV64I-NEXT:    sb a4, 4(a2)
 ; RV64I-NEXT:    sb a5, 5(a2)
 ; RV64I-NEXT:    sb a1, 6(a2)
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    sb t0, 1(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
+; RV64I-NEXT:    sb a7, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_8bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 5(a0)
-; RV32I-NEXT:    lbu a4, 4(a0)
-; RV32I-NEXT:    lbu a5, 6(a0)
-; RV32I-NEXT:    lbu a6, 7(a0)
+; RV32I-NEXT:    lbu a4, 6(a0)
+; RV32I-NEXT:    lbu a5, 7(a0)
+; RV32I-NEXT:    lbu a6, 4(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    or a5, a1, a5
-; RV32I-NEXT:    addi a4, a5, -32
-; RV32I-NEXT:    srl a1, a3, a5
-; RV32I-NEXT:    bltz a4, .LBB3_2
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a5, a4, a3
+; RV32I-NEXT:    or a4, a1, a6
+; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    srl a1, a5, a4
+; RV32I-NEXT:    bltz a3, .LBB3_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB3_3
@@ -289,29 +289,29 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    not a7, a4
+; RV32I-NEXT:    slli a5, a5, 1
 ; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    srl a0, a0, a5
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    sll a3, a3, a5
-; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    srl a0, a0, a4
+; RV32I-NEXT:    sll a4, a5, a7
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:  .LBB3_3:
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    and a1, a4, a1
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    srli a5, a0, 24
+; RV32I-NEXT:    and a1, a3, a1
 ; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    srli a4, a1, 24
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    srli a6, a1, 24
+; RV32I-NEXT:    srli a7, a1, 8
 ; RV32I-NEXT:    sb a1, 4(a2)
-; RV32I-NEXT:    sb a5, 5(a2)
+; RV32I-NEXT:    sb a7, 5(a2)
 ; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb a4, 2(a2)
+; RV32I-NEXT:    sb a5, 3(a2)
 ; RV32I-NEXT:    ret
   %src = load i64, ptr %src.ptr, align 1
   %bitOff = load i64, ptr %bitOff.ptr, align 1
@@ -322,93 +322,93 @@ define void @lshr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a4
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    or t0, t2, t0
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    slli t2, t2, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a4, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a4, 0(a1)
-; RV64I-NEXT:    lbu a5, 1(a1)
+; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 2(a1)
-; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 5(a1)
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    lbu a4, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a1, a1, 24
 ; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a3
 ; RV64I-NEXT:    sll a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 48
 ; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    srli a5, a0, 40
+; RV64I-NEXT:    srli a6, a0, 16
+; RV64I-NEXT:    srli a7, a0, 24
+; RV64I-NEXT:    srli t0, a0, 8
 ; RV64I-NEXT:    sb a4, 4(a2)
 ; RV64I-NEXT:    sb a5, 5(a2)
 ; RV64I-NEXT:    sb a1, 6(a2)
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    sb t0, 1(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
+; RV64I-NEXT:    sb a7, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_8bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
+; RV32I-NEXT:    lbu a4, 2(a0)
+; RV32I-NEXT:    lbu a5, 3(a0)
+; RV32I-NEXT:    lbu a6, 0(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 0(a1)
-; RV32I-NEXT:    lbu a6, 1(a1)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
 ; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    slli a5, a5, 24
+; RV32I-NEXT:    or a3, a3, a6
+; RV32I-NEXT:    lbu a6, 0(a1)
+; RV32I-NEXT:    lbu a7, 1(a1)
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a7, a7, 8
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a4
-; RV32I-NEXT:    or a5, a1, a5
-; RV32I-NEXT:    addi a4, a5, -32
-; RV32I-NEXT:    sll a1, a3, a5
-; RV32I-NEXT:    bltz a4, .LBB4_2
+; RV32I-NEXT:    or a1, a1, a5
+; RV32I-NEXT:    or a5, a4, a3
+; RV32I-NEXT:    or a4, a1, a6
+; RV32I-NEXT:    addi a3, a4, -32
+; RV32I-NEXT:    sll a1, a5, a4
+; RV32I-NEXT:    bltz a3, .LBB4_2
 ; RV32I-NEXT:  # %bb.1:
 ; RV32I-NEXT:    mv a0, a1
 ; RV32I-NEXT:    j .LBB4_3
@@ -422,29 +422,29 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli t0, t0, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    not a7, a4
+; RV32I-NEXT:    srli a5, a5, 1
 ; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    sll a0, a0, a5
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    srl a3, a3, a5
-; RV32I-NEXT:    or a0, a0, a3
+; RV32I-NEXT:    sll a0, a0, a4
+; RV32I-NEXT:    srl a4, a5, a7
+; RV32I-NEXT:    or a0, a0, a4
 ; RV32I-NEXT:  .LBB4_3:
-; RV32I-NEXT:    srai a4, a4, 31
-; RV32I-NEXT:    and a1, a4, a1
+; RV32I-NEXT:    srai a3, a3, 31
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    srli a5, a0, 24
+; RV32I-NEXT:    and a1, a3, a1
 ; RV32I-NEXT:    srli a3, a1, 16
-; RV32I-NEXT:    srli a4, a1, 24
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    srli a6, a1, 24
+; RV32I-NEXT:    srli a7, a1, 8
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
+; RV32I-NEXT:    sb a7, 1(a2)
 ; RV32I-NEXT:    sb a3, 2(a2)
-; RV32I-NEXT:    sb a4, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb a6, 3(a2)
+; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    sb a0, 4(a2)
-; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a1, 6(a2)
-; RV32I-NEXT:    sb a3, 7(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb a4, 6(a2)
+; RV32I-NEXT:    sb a5, 7(a2)
 ; RV32I-NEXT:    ret
   %src = load i64, ptr %src.ptr, align 1
   %bitOff = load i64, ptr %bitOff.ptr, align 1
@@ -455,66 +455,66 @@ define void @shl_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_8bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t2, 3(a1)
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    or a0, a0, a5
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a4, 0(a1)
-; RV64I-NEXT:    lbu a5, 1(a1)
-; RV64I-NEXT:    or a0, a0, a3
-; RV64I-NEXT:    lbu a3, 2(a1)
-; RV64I-NEXT:    lbu a6, 3(a1)
-; RV64I-NEXT:    slli a5, a5, 8
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    slli a3, a3, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a6, a3
-; RV64I-NEXT:    lbu a5, 4(a1)
-; RV64I-NEXT:    lbu a6, 5(a1)
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    lbu a4, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
+; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a0, a0, t1
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t1, 5(a1)
+; RV64I-NEXT:    or t0, t2, t0
+; RV64I-NEXT:    lbu t2, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or a7, t1, a7
+; RV64I-NEXT:    slli t2, t2, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a4
-; RV64I-NEXT:    or a1, a1, a5
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    or a4, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a1, a1, a3
+; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    or a1, a1, a4
 ; RV64I-NEXT:    sra a0, a0, a1
 ; RV64I-NEXT:    srli a1, a0, 48
 ; RV64I-NEXT:    srli a3, a0, 56
 ; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    srli a5, a0, 40
+; RV64I-NEXT:    srli a6, a0, 16
+; RV64I-NEXT:    srli a7, a0, 24
+; RV64I-NEXT:    srli t0, a0, 8
 ; RV64I-NEXT:    sb a4, 4(a2)
 ; RV64I-NEXT:    sb a5, 5(a2)
 ; RV64I-NEXT:    sb a1, 6(a2)
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 16
-; RV64I-NEXT:    srli a3, a0, 24
-; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a1, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
+; RV64I-NEXT:    sb t0, 1(a2)
+; RV64I-NEXT:    sb a6, 2(a2)
+; RV64I-NEXT:    sb a7, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_8bytes:
@@ -524,60 +524,60 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a5, 6(a0)
 ; RV32I-NEXT:    lbu a6, 7(a0)
 ; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    lbu a7, 0(a1)
+; RV32I-NEXT:    lbu t0, 1(a1)
 ; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a4, a6, 24
-; RV32I-NEXT:    or a5, a4, a5
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu a7, 1(a1)
-; RV32I-NEXT:    or a3, a5, a3
-; RV32I-NEXT:    lbu a5, 2(a1)
+; RV32I-NEXT:    lbu a4, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    slli a4, a4, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a5
-; RV32I-NEXT:    or a5, a1, a6
-; RV32I-NEXT:    addi a6, a5, -32
-; RV32I-NEXT:    sra a1, a3, a5
+; RV32I-NEXT:    or a1, a1, a4
+; RV32I-NEXT:    slli a4, a5, 16
+; RV32I-NEXT:    slli a5, a6, 24
+; RV32I-NEXT:    or a4, a5, a4
+; RV32I-NEXT:    or a4, a4, a3
+; RV32I-NEXT:    or a3, a1, a7
+; RV32I-NEXT:    addi a6, a3, -32
+; RV32I-NEXT:    sra a1, a4, a3
 ; RV32I-NEXT:    bltz a6, .LBB5_2
 ; RV32I-NEXT:  # %bb.1:
-; RV32I-NEXT:    srai a4, a4, 31
+; RV32I-NEXT:    srai a5, a5, 31
 ; RV32I-NEXT:    mv a0, a1
-; RV32I-NEXT:    mv a1, a4
+; RV32I-NEXT:    mv a1, a5
 ; RV32I-NEXT:    j .LBB5_3
 ; RV32I-NEXT:  .LBB5_2:
-; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a5, 1(a0)
 ; RV32I-NEXT:    lbu a6, 0(a0)
 ; RV32I-NEXT:    lbu a7, 2(a0)
 ; RV32I-NEXT:    lbu a0, 3(a0)
-; RV32I-NEXT:    slli a4, a4, 8
-; RV32I-NEXT:    or a4, a4, a6
+; RV32I-NEXT:    slli a5, a5, 8
+; RV32I-NEXT:    or a5, a5, a6
 ; RV32I-NEXT:    slli a7, a7, 16
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    or a0, a0, a7
-; RV32I-NEXT:    or a0, a0, a4
-; RV32I-NEXT:    srl a0, a0, a5
-; RV32I-NEXT:    not a4, a5
-; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    sll a3, a3, a4
+; RV32I-NEXT:    not a6, a3
+; RV32I-NEXT:    slli a4, a4, 1
+; RV32I-NEXT:    or a0, a0, a5
+; RV32I-NEXT:    srl a0, a0, a3
+; RV32I-NEXT:    sll a3, a4, a6
 ; RV32I-NEXT:    or a0, a0, a3
 ; RV32I-NEXT:  .LBB5_3:
 ; RV32I-NEXT:    srli a3, a1, 16
 ; RV32I-NEXT:    srli a4, a1, 24
 ; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    srli a6, a0, 16
+; RV32I-NEXT:    srli a7, a0, 24
 ; RV32I-NEXT:    sb a1, 4(a2)
 ; RV32I-NEXT:    sb a5, 5(a2)
 ; RV32I-NEXT:    sb a3, 6(a2)
 ; RV32I-NEXT:    sb a4, 7(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a3, a0, 24
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    srli a1, a0, 8
 ; RV32I-NEXT:    sb a0, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a1, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
+; RV32I-NEXT:    sb a1, 1(a2)
+; RV32I-NEXT:    sb a6, 2(a2)
+; RV32I-NEXT:    sb a7, 3(a2)
 ; RV32I-NEXT:    ret
   %src = load i64, ptr %src.ptr, align 1
   %bitOff = load i64, ptr %bitOff.ptr, align 1
@@ -589,231 +589,231 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 2(a1)
-; RV64I-NEXT:    lbu a7, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t3, 3(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    lbu t3, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a5, a1, a4
-; RV64I-NEXT:    addi a4, a5, -64
-; RV64I-NEXT:    srl a1, a3, a5
-; RV64I-NEXT:    bltz a4, .LBB6_2
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a4, a1, a6
+; RV64I-NEXT:    addi a3, a4, -64
+; RV64I-NEXT:    srl a1, a5, a4
+; RV64I-NEXT:    bltz a3, .LBB6_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB6_3
 ; RV64I-NEXT:  .LBB6_2:
 ; RV64I-NEXT:    lbu a6, 1(a0)
-; RV64I-NEXT:    lbu a7, 0(a0)
-; RV64I-NEXT:    lbu t0, 2(a0)
-; RV64I-NEXT:    lbu t1, 3(a0)
+; RV64I-NEXT:    lbu a7, 2(a0)
+; RV64I-NEXT:    lbu t0, 3(a0)
+; RV64I-NEXT:    lbu t1, 0(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    lbu t0, 4(a0)
-; RV64I-NEXT:    lbu t1, 5(a0)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    lbu t1, 4(a0)
+; RV64I-NEXT:    lbu t2, 5(a0)
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    not a7, a4
+; RV64I-NEXT:    slli a5, a5, 1
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    srl a0, a0, a5
-; RV64I-NEXT:    not a5, a5
-; RV64I-NEXT:    slli a3, a3, 1
-; RV64I-NEXT:    sll a3, a3, a5
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    srl a0, a0, a4
+; RV64I-NEXT:    sll a4, a5, a7
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:  .LBB6_3:
-; RV64I-NEXT:    srai a4, a4, 63
-; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    srai a3, a3, 63
+; RV64I-NEXT:    srli a4, a0, 56
+; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a6, a0, 40
+; RV64I-NEXT:    srli a7, a0, 32
+; RV64I-NEXT:    srli t0, a0, 24
+; RV64I-NEXT:    srli t1, a0, 16
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    sb a7, 4(a2)
+; RV64I-NEXT:    sb a6, 5(a2)
+; RV64I-NEXT:    sb a5, 6(a2)
+; RV64I-NEXT:    sb a4, 7(a2)
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    srli a4, a1, 48
 ; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    srli a6, a1, 32
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t2, a1, 16
 ; RV64I-NEXT:    sb a6, 12(a2)
 ; RV64I-NEXT:    sb a5, 13(a2)
 ; RV64I-NEXT:    sb a4, 14(a2)
 ; RV64I-NEXT:    sb a3, 15(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    srli a3, a1, 8
 ; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    sb a5, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    sb a4, 5(a2)
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb a3, 9(a2)
+; RV64I-NEXT:    sb t2, 10(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
-; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    sb a1, 1(a2)
+; RV64I-NEXT:    sb t1, 2(a2)
+; RV64I-NEXT:    sb t0, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_16bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    lbu t3, 10(a0)
+; RV32I-NEXT:    lbu t4, 11(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu t0, 1(a1)
-; RV32I-NEXT:    or a0, a0, a7
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t4, 1(a1)
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t2, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t1, t4, t1
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    mv t2, sp
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a1, a1, t1
 ; RV32I-NEXT:    sw a3, 0(sp)
 ; RV32I-NEXT:    sw a4, 4(sp)
 ; RV32I-NEXT:    sw a5, 8(sp)
 ; RV32I-NEXT:    sw a0, 12(sp)
 ; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a3, a1, 31
 ; RV32I-NEXT:    andi a0, a0, 12
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    add a0, t2, a0
 ; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
+; RV32I-NEXT:    lw a6, 8(a0)
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    srl a6, a3, a1
-; RV32I-NEXT:    andi a7, a1, 31
-; RV32I-NEXT:    xori a7, a7, 31
-; RV32I-NEXT:    slli t0, a5, 1
-; RV32I-NEXT:    sll t0, t0, a7
-; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    srl a7, a5, a1
+; RV32I-NEXT:    slli t0, a6, 1
 ; RV32I-NEXT:    srl a4, a4, a1
-; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    sll a3, a3, a7
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    srl a4, a5, a1
-; RV32I-NEXT:    slli a5, a0, 1
-; RV32I-NEXT:    sll a5, a5, a7
-; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl a6, a6, a1
+; RV32I-NEXT:    slli t1, a0, 1
 ; RV32I-NEXT:    srl a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a5, a0, 24
-; RV32I-NEXT:    srli a7, a0, 8
+; RV32I-NEXT:    sll a1, t0, a3
+; RV32I-NEXT:    sll a5, a5, a3
+; RV32I-NEXT:    sll a3, t1, a3
+; RV32I-NEXT:    srli t0, a0, 16
+; RV32I-NEXT:    srli t1, a0, 24
+; RV32I-NEXT:    srli t2, a0, 8
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    or a3, a6, a3
 ; RV32I-NEXT:    sb a0, 12(a2)
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    sb a5, 15(a2)
-; RV32I-NEXT:    srli a0, a4, 16
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    srli a5, a4, 8
-; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a5, 9(a2)
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    sb t1, 15(a2)
 ; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    srli a4, a3, 8
-; RV32I-NEXT:    sb a3, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a6, 16
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    srli a3, a6, 8
-; RV32I-NEXT:    sb a6, 4(a2)
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a5, a3, 24
+; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    srli t0, a4, 24
+; RV32I-NEXT:    srli t1, a4, 8
+; RV32I-NEXT:    srli t2, a1, 16
+; RV32I-NEXT:    srli t3, a1, 24
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a6, 9(a2)
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a0, a1, 8
+; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb t1, 1(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    sb t2, 6(a2)
+; RV32I-NEXT:    sb t3, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -825,231 +825,231 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 0(a1)
-; RV64I-NEXT:    lbu a6, 1(a1)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 2(a1)
-; RV64I-NEXT:    lbu a7, 3(a1)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 4(a1)
-; RV64I-NEXT:    lbu a7, 5(a1)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 6(a1)
-; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    lbu a6, 0(a1)
+; RV64I-NEXT:    lbu a7, 1(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t3, 3(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a7, 4(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    lbu t3, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a5
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, t1, a5
+; RV64I-NEXT:    or a6, t0, a6
+; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a5, a1, a4
-; RV64I-NEXT:    addi a4, a5, -64
-; RV64I-NEXT:    sll a1, a3, a5
-; RV64I-NEXT:    bltz a4, .LBB7_2
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a4, a1, a6
+; RV64I-NEXT:    addi a3, a4, -64
+; RV64I-NEXT:    sll a1, a5, a4
+; RV64I-NEXT:    bltz a3, .LBB7_2
 ; RV64I-NEXT:  # %bb.1:
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    j .LBB7_3
 ; RV64I-NEXT:  .LBB7_2:
 ; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    lbu a7, 8(a0)
-; RV64I-NEXT:    lbu t0, 10(a0)
-; RV64I-NEXT:    lbu t1, 11(a0)
+; RV64I-NEXT:    lbu a7, 10(a0)
+; RV64I-NEXT:    lbu t0, 11(a0)
+; RV64I-NEXT:    lbu t1, 8(a0)
 ; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli t0, t0, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a7, t1, t0
-; RV64I-NEXT:    lbu t0, 12(a0)
-; RV64I-NEXT:    lbu t1, 13(a0)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 14(a0)
-; RV64I-NEXT:    lbu a0, 15(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
 ; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t0, t0, 24
+; RV64I-NEXT:    or a6, a6, t1
+; RV64I-NEXT:    lbu t1, 12(a0)
+; RV64I-NEXT:    lbu t2, 13(a0)
+; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu t0, 14(a0)
+; RV64I-NEXT:    lbu a0, 15(a0)
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or t1, t2, t1
+; RV64I-NEXT:    slli t0, t0, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
 ; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    not a7, a4
+; RV64I-NEXT:    srli a5, a5, 1
+; RV64I-NEXT:    or a0, a0, t1
 ; RV64I-NEXT:    slli a0, a0, 32
 ; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    sll a0, a0, a5
-; RV64I-NEXT:    not a5, a5
-; RV64I-NEXT:    srli a3, a3, 1
-; RV64I-NEXT:    srl a3, a3, a5
-; RV64I-NEXT:    or a0, a0, a3
+; RV64I-NEXT:    sll a0, a0, a4
+; RV64I-NEXT:    srl a4, a5, a7
+; RV64I-NEXT:    or a0, a0, a4
 ; RV64I-NEXT:  .LBB7_3:
-; RV64I-NEXT:    srai a4, a4, 63
-; RV64I-NEXT:    and a1, a4, a1
+; RV64I-NEXT:    srai a3, a3, 63
+; RV64I-NEXT:    srli a4, a0, 56
+; RV64I-NEXT:    srli a5, a0, 48
+; RV64I-NEXT:    srli a6, a0, 40
+; RV64I-NEXT:    srli a7, a0, 32
+; RV64I-NEXT:    srli t0, a0, 24
+; RV64I-NEXT:    srli t1, a0, 16
+; RV64I-NEXT:    and a1, a3, a1
+; RV64I-NEXT:    sb a7, 12(a2)
+; RV64I-NEXT:    sb a6, 13(a2)
+; RV64I-NEXT:    sb a5, 14(a2)
+; RV64I-NEXT:    sb a4, 15(a2)
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    srli a4, a1, 48
 ; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    srli a6, a1, 32
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t2, a1, 16
 ; RV64I-NEXT:    sb a6, 4(a2)
 ; RV64I-NEXT:    sb a5, 5(a2)
 ; RV64I-NEXT:    sb a4, 6(a2)
 ; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    srli a3, a1, 8
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
-; RV64I-NEXT:    sb a4, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb a3, 1(a2)
+; RV64I-NEXT:    sb t2, 2(a2)
+; RV64I-NEXT:    sb a7, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 8
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb t1, 10(a2)
+; RV64I-NEXT:    sb t0, 11(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_16bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    lbu t3, 10(a0)
+; RV32I-NEXT:    lbu t4, 11(a0)
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t2, t1
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t2, t2, 16
 ; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu t0, 1(a1)
-; RV32I-NEXT:    or a0, a0, a7
-; RV32I-NEXT:    lbu a7, 2(a1)
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    lbu t1, 0(a1)
+; RV32I-NEXT:    lbu t4, 1(a1)
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t2, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a6, t0, a6
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a1, a1, a6
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    or t1, t4, t1
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t2
+; RV32I-NEXT:    addi t2, sp, 16
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a1, a1, t1
 ; RV32I-NEXT:    sw a3, 16(sp)
 ; RV32I-NEXT:    sw a4, 20(sp)
 ; RV32I-NEXT:    sw a5, 24(sp)
 ; RV32I-NEXT:    sw a0, 28(sp)
 ; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a3, a1, 31
 ; RV32I-NEXT:    andi a0, a0, 12
-; RV32I-NEXT:    addi a3, sp, 16
-; RV32I-NEXT:    sub a3, a3, a0
-; RV32I-NEXT:    lw a0, 4(a3)
-; RV32I-NEXT:    lw a4, 0(a3)
-; RV32I-NEXT:    lw a5, 8(a3)
-; RV32I-NEXT:    lw a3, 12(a3)
-; RV32I-NEXT:    sll a6, a0, a1
-; RV32I-NEXT:    andi a7, a1, 31
-; RV32I-NEXT:    xori a7, a7, 31
+; RV32I-NEXT:    sub a0, t2, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
+; RV32I-NEXT:    lw a6, 8(a0)
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    sll a7, a5, a1
 ; RV32I-NEXT:    srli t0, a4, 1
-; RV32I-NEXT:    srl t0, t0, a7
-; RV32I-NEXT:    or a6, a6, t0
-; RV32I-NEXT:    sll a3, a3, a1
-; RV32I-NEXT:    srli t0, a5, 1
-; RV32I-NEXT:    srl t0, t0, a7
-; RV32I-NEXT:    or a3, a3, t0
-; RV32I-NEXT:    sll a5, a5, a1
-; RV32I-NEXT:    srli a0, a0, 1
-; RV32I-NEXT:    srl a0, a0, a7
-; RV32I-NEXT:    or a0, a5, a0
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    srli t1, a6, 1
+; RV32I-NEXT:    sll a6, a6, a1
+; RV32I-NEXT:    srli a5, a5, 1
 ; RV32I-NEXT:    sll a1, a4, a1
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 24
-; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    srl a4, t0, a3
+; RV32I-NEXT:    srl t0, t1, a3
+; RV32I-NEXT:    srl a3, a5, a3
+; RV32I-NEXT:    srli a5, a1, 16
+; RV32I-NEXT:    srli t1, a1, 24
+; RV32I-NEXT:    srli t2, a1, 8
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a3, a6, a3
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a7, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a5, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a4, a0, 24
-; RV32I-NEXT:    srli a5, a0, 8
-; RV32I-NEXT:    sb a0, 8(a2)
-; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    sb t2, 1(a2)
+; RV32I-NEXT:    sb a5, 2(a2)
+; RV32I-NEXT:    sb t1, 3(a2)
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    srli a5, a3, 24
+; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a0, 16
+; RV32I-NEXT:    srli t0, a0, 24
+; RV32I-NEXT:    srli t1, a0, 8
+; RV32I-NEXT:    srli t2, a4, 16
+; RV32I-NEXT:    srli t3, a4, 24
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a6, 9(a2)
 ; RV32I-NEXT:    sb a1, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    srli a4, a3, 8
-; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a4, 13(a2)
-; RV32I-NEXT:    sb a0, 14(a2)
-; RV32I-NEXT:    sb a1, 15(a2)
-; RV32I-NEXT:    srli a0, a6, 16
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    srli a3, a6, 8
-; RV32I-NEXT:    sb a6, 4(a2)
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a1, a4, 8
+; RV32I-NEXT:    sb a0, 12(a2)
+; RV32I-NEXT:    sb t1, 13(a2)
+; RV32I-NEXT:    sb a7, 14(a2)
+; RV32I-NEXT:    sb t0, 15(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
+; RV32I-NEXT:    sb t2, 6(a2)
+; RV32I-NEXT:    sb t3, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1061,232 +1061,232 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_16bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lbu a3, 9(a0)
-; RV64I-NEXT:    lbu a4, 8(a0)
+; RV64I-NEXT:    lbu a3, 8(a0)
+; RV64I-NEXT:    lbu a4, 9(a0)
 ; RV64I-NEXT:    lbu a5, 10(a0)
 ; RV64I-NEXT:    lbu a6, 11(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 12(a0)
+; RV64I-NEXT:    lbu t0, 13(a0)
+; RV64I-NEXT:    lbu t1, 14(a0)
+; RV64I-NEXT:    lbu t2, 15(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 12(a0)
-; RV64I-NEXT:    lbu a6, 13(a0)
+; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 14(a0)
-; RV64I-NEXT:    lbu a7, 15(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a5, a4, 32
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
 ; RV64I-NEXT:    lbu a6, 0(a1)
 ; RV64I-NEXT:    lbu a7, 1(a1)
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    lbu a5, 2(a1)
-; RV64I-NEXT:    lbu t0, 3(a1)
+; RV64I-NEXT:    lbu t0, 2(a1)
+; RV64I-NEXT:    lbu t3, 3(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
 ; RV64I-NEXT:    slli a7, a7, 8
+; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli t3, t3, 24
+; RV64I-NEXT:    or t1, t2, t1
 ; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
 ; RV64I-NEXT:    lbu a7, 4(a1)
-; RV64I-NEXT:    lbu t0, 5(a1)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 6(a1)
+; RV64I-NEXT:    lbu t2, 5(a1)
+; RV64I-NEXT:    or t0, t3, t0
+; RV64I-NEXT:    lbu t3, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli t2, t2, 8
+; RV64I-NEXT:    or a7, t2, a7
+; RV64I-NEXT:    slli t3, t3, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a6
+; RV64I-NEXT:    or a1, a1, t3
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a5, t1, a5
+; RV64I-NEXT:    or a6, t0, a6
 ; RV64I-NEXT:    or a1, a1, a7
+; RV64I-NEXT:    slli a4, a5, 32
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a5, a1, a5
-; RV64I-NEXT:    addi a6, a5, -64
-; RV64I-NEXT:    sra a1, a3, a5
+; RV64I-NEXT:    or a4, a4, a3
+; RV64I-NEXT:    or a3, a1, a6
+; RV64I-NEXT:    addi a6, a3, -64
+; RV64I-NEXT:    sra a1, a4, a3
 ; RV64I-NEXT:    bltz a6, .LBB8_2
 ; RV64I-NEXT:  # %bb.1:
-; RV64I-NEXT:    sraiw a3, a4, 31
+; RV64I-NEXT:    sraiw a3, a5, 31
 ; RV64I-NEXT:    mv a0, a1
 ; RV64I-NEXT:    mv a1, a3
 ; RV64I-NEXT:    j .LBB8_3
 ; RV64I-NEXT:  .LBB8_2:
-; RV64I-NEXT:    lbu a4, 1(a0)
-; RV64I-NEXT:    lbu a6, 0(a0)
-; RV64I-NEXT:    lbu a7, 2(a0)
-; RV64I-NEXT:    lbu t0, 3(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    or a4, a4, a6
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a6, t0, a7
-; RV64I-NEXT:    lbu a7, 4(a0)
-; RV64I-NEXT:    lbu t0, 5(a0)
-; RV64I-NEXT:    or a4, a6, a4
-; RV64I-NEXT:    lbu a6, 6(a0)
-; RV64I-NEXT:    lbu a0, 7(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
+; RV64I-NEXT:    lbu a5, 1(a0)
+; RV64I-NEXT:    lbu a6, 2(a0)
+; RV64I-NEXT:    lbu a7, 3(a0)
+; RV64I-NEXT:    lbu t0, 0(a0)
+; RV64I-NEXT:    slli a5, a5, 8
 ; RV64I-NEXT:    slli a6, a6, 16
+; RV64I-NEXT:    slli a7, a7, 24
+; RV64I-NEXT:    or a5, a5, t0
+; RV64I-NEXT:    lbu t0, 4(a0)
+; RV64I-NEXT:    lbu t1, 5(a0)
+; RV64I-NEXT:    or a6, a7, a6
+; RV64I-NEXT:    lbu a7, 6(a0)
+; RV64I-NEXT:    lbu a0, 7(a0)
+; RV64I-NEXT:    slli t1, t1, 8
+; RV64I-NEXT:    or t0, t1, t0
+; RV64I-NEXT:    slli a7, a7, 16
 ; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a6
 ; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    not a6, a3
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    or a0, a0, t0
 ; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    or a0, a0, a4
-; RV64I-NEXT:    srl a0, a0, a5
-; RV64I-NEXT:    not a4, a5
-; RV64I-NEXT:    slli a3, a3, 1
-; RV64I-NEXT:    sll a3, a3, a4
+; RV64I-NEXT:    or a0, a0, a5
+; RV64I-NEXT:    srl a0, a0, a3
+; RV64I-NEXT:    sll a3, a4, a6
 ; RV64I-NEXT:    or a0, a0, a3
 ; RV64I-NEXT:  .LBB8_3:
 ; RV64I-NEXT:    srli a3, a1, 56
 ; RV64I-NEXT:    srli a4, a1, 48
 ; RV64I-NEXT:    srli a5, a1, 40
 ; RV64I-NEXT:    srli a6, a1, 32
+; RV64I-NEXT:    srli a7, a1, 24
+; RV64I-NEXT:    srli t0, a1, 16
+; RV64I-NEXT:    srli t1, a1, 8
+; RV64I-NEXT:    srli t2, a0, 56
+; RV64I-NEXT:    srli t3, a0, 48
+; RV64I-NEXT:    srli t4, a0, 40
+; RV64I-NEXT:    srli t5, a0, 32
 ; RV64I-NEXT:    sb a6, 12(a2)
 ; RV64I-NEXT:    sb a5, 13(a2)
 ; RV64I-NEXT:    sb a4, 14(a2)
 ; RV64I-NEXT:    sb a3, 15(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    srli a3, a0, 24
 ; RV64I-NEXT:    sb a1, 8(a2)
-; RV64I-NEXT:    sb a5, 9(a2)
-; RV64I-NEXT:    sb a4, 10(a2)
-; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 4(a2)
-; RV64I-NEXT:    sb a4, 5(a2)
-; RV64I-NEXT:    sb a3, 6(a2)
-; RV64I-NEXT:    sb a1, 7(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
+; RV64I-NEXT:    sb t1, 9(a2)
+; RV64I-NEXT:    sb t0, 10(a2)
+; RV64I-NEXT:    sb a7, 11(a2)
+; RV64I-NEXT:    srli a1, a0, 16
+; RV64I-NEXT:    sb t5, 4(a2)
+; RV64I-NEXT:    sb t4, 5(a2)
+; RV64I-NEXT:    sb t3, 6(a2)
+; RV64I-NEXT:    sb t2, 7(a2)
 ; RV64I-NEXT:    srli a4, a0, 8
 ; RV64I-NEXT:    sb a0, 0(a2)
 ; RV64I-NEXT:    sb a4, 1(a2)
-; RV64I-NEXT:    sb a3, 2(a2)
-; RV64I-NEXT:    sb a1, 3(a2)
+; RV64I-NEXT:    sb a1, 2(a2)
+; RV64I-NEXT:    sb a3, 3(a2)
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_16bytes:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -32
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
 ; RV32I-NEXT:    lbu a5, 2(a0)
 ; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    lbu a7, 4(a0)
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    slli t0, t0, 8
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
 ; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
-; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    lbu t0, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
 ; RV32I-NEXT:    lbu a0, 15(a0)
-; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a6, a0, a6
-; RV32I-NEXT:    lbu t0, 0(a1)
-; RV32I-NEXT:    lbu t1, 1(a1)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 2(a1)
-; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
 ; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    or t3, t5, t4
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 1(a1)
 ; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    lbu t1, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t5, t5, 8
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a7
-; RV32I-NEXT:    or a1, a1, t0
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    mv a5, sp
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t1, a0, t2
 ; RV32I-NEXT:    srai a0, a0, 31
+; RV32I-NEXT:    or a6, a7, a6
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a1, a1, t4
 ; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    sw a0, 28(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a6, 12(sp)
+; RV32I-NEXT:    sw a6, 4(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a7, 12(sp)
 ; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a3, a1, 31
 ; RV32I-NEXT:    andi a0, a0, 12
-; RV32I-NEXT:    mv a3, sp
-; RV32I-NEXT:    add a0, a3, a0
-; RV32I-NEXT:    lw a3, 4(a0)
+; RV32I-NEXT:    add a0, a5, a0
 ; RV32I-NEXT:    lw a4, 0(a0)
-; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
+; RV32I-NEXT:    lw a6, 8(a0)
+; RV32I-NEXT:    xori a3, a3, 31
 ; RV32I-NEXT:    lw a0, 12(a0)
-; RV32I-NEXT:    srl a6, a3, a1
-; RV32I-NEXT:    andi a7, a1, 31
-; RV32I-NEXT:    xori a7, a7, 31
-; RV32I-NEXT:    slli t0, a5, 1
-; RV32I-NEXT:    sll t0, t0, a7
-; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    srl a7, a5, a1
+; RV32I-NEXT:    slli t0, a6, 1
 ; RV32I-NEXT:    srl a4, a4, a1
-; RV32I-NEXT:    slli a3, a3, 1
-; RV32I-NEXT:    sll a3, a3, a7
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    srl a4, a5, a1
-; RV32I-NEXT:    slli a5, a0, 1
-; RV32I-NEXT:    sll a5, a5, a7
-; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl a6, a6, a1
+; RV32I-NEXT:    slli t1, a0, 1
 ; RV32I-NEXT:    sra a0, a0, a1
-; RV32I-NEXT:    srli a1, a0, 16
-; RV32I-NEXT:    srli a5, a0, 24
-; RV32I-NEXT:    srli a7, a0, 8
+; RV32I-NEXT:    sll a1, t0, a3
+; RV32I-NEXT:    sll a5, a5, a3
+; RV32I-NEXT:    sll a3, t1, a3
+; RV32I-NEXT:    srli t0, a0, 16
+; RV32I-NEXT:    srli t1, a0, 24
+; RV32I-NEXT:    srli t2, a0, 8
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a4, a4, a5
+; RV32I-NEXT:    or a3, a6, a3
 ; RV32I-NEXT:    sb a0, 12(a2)
-; RV32I-NEXT:    sb a7, 13(a2)
-; RV32I-NEXT:    sb a1, 14(a2)
-; RV32I-NEXT:    sb a5, 15(a2)
-; RV32I-NEXT:    srli a0, a4, 16
-; RV32I-NEXT:    srli a1, a4, 24
-; RV32I-NEXT:    srli a5, a4, 8
-; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a5, 9(a2)
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    sb a1, 11(a2)
+; RV32I-NEXT:    sb t2, 13(a2)
+; RV32I-NEXT:    sb t0, 14(a2)
+; RV32I-NEXT:    sb t1, 15(a2)
 ; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a1, a3, 24
-; RV32I-NEXT:    srli a4, a3, 8
-; RV32I-NEXT:    sb a3, 0(a2)
-; RV32I-NEXT:    sb a4, 1(a2)
-; RV32I-NEXT:    sb a0, 2(a2)
-; RV32I-NEXT:    sb a1, 3(a2)
-; RV32I-NEXT:    srli a0, a6, 16
-; RV32I-NEXT:    srli a1, a6, 24
-; RV32I-NEXT:    srli a3, a6, 8
-; RV32I-NEXT:    sb a6, 4(a2)
-; RV32I-NEXT:    sb a3, 5(a2)
-; RV32I-NEXT:    sb a0, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
+; RV32I-NEXT:    srli a5, a3, 24
+; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    srli t0, a4, 24
+; RV32I-NEXT:    srli t1, a4, 8
+; RV32I-NEXT:    srli t2, a1, 16
+; RV32I-NEXT:    srli t3, a1, 24
+; RV32I-NEXT:    sb a3, 8(a2)
+; RV32I-NEXT:    sb a6, 9(a2)
+; RV32I-NEXT:    sb a0, 10(a2)
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a0, a1, 8
+; RV32I-NEXT:    sb a4, 0(a2)
+; RV32I-NEXT:    sb t1, 1(a2)
+; RV32I-NEXT:    sb a7, 2(a2)
+; RV32I-NEXT:    sb t0, 3(a2)
+; RV32I-NEXT:    sb a1, 4(a2)
+; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    sb t2, 6(a2)
+; RV32I-NEXT:    sb t3, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
   %src = load i128, ptr %src.ptr, align 1
@@ -1299,420 +1299,472 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli s8, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 5(a1)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a6, a6, s8
+; RV64I-NEXT:    or a3, t0, a7
+; RV64I-NEXT:    or a4, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t3, s5, s4
+; RV64I-NEXT:    or t4, s7, s6
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    lbu s6, 5(a1)
+; RV64I-NEXT:    or s4, s7, s4
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli s6, s6, 8
+; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a6, a1, a6
+; RV64I-NEXT:    or a1, a1, s7
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    mv a6, sp
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t1, s0, t6
+; RV64I-NEXT:    or t2, s5, s1
+; RV64I-NEXT:    or t3, s4, s2
+; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t2, t2, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a0, 24(sp)
-; RV64I-NEXT:    srli a0, a6, 3
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a3, a1, 63
 ; RV64I-NEXT:    andi a0, a0, 24
-; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    ld a1, 8(a0)
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    ld a4, 16(a0)
-; RV64I-NEXT:    ld a5, 24(a0)
-; RV64I-NEXT:    srl a0, a1, a6
-; RV64I-NEXT:    andi a7, a6, 63
-; RV64I-NEXT:    xori a7, a7, 63
-; RV64I-NEXT:    slli t0, a4, 1
-; RV64I-NEXT:    sll t0, t0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    srl a3, a3, a6
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    sll a1, a1, a7
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    srl a3, a4, a6
-; RV64I-NEXT:    slli a4, a5, 1
-; RV64I-NEXT:    sll a4, a4, a7
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    srl a4, a5, a6
-; RV64I-NEXT:    srli a5, a4, 56
-; RV64I-NEXT:    srli a6, a4, 48
-; RV64I-NEXT:    srli a7, a4, 40
-; RV64I-NEXT:    srli t0, a4, 32
-; RV64I-NEXT:    sb t0, 28(a2)
-; RV64I-NEXT:    sb a7, 29(a2)
-; RV64I-NEXT:    sb a6, 30(a2)
+; RV64I-NEXT:    add a0, a6, a0
+; RV64I-NEXT:    ld a4, 0(a0)
+; RV64I-NEXT:    ld a5, 8(a0)
+; RV64I-NEXT:    ld a6, 16(a0)
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    srl a7, a5, a1
+; RV64I-NEXT:    slli t0, a6, 1
+; RV64I-NEXT:    srl a4, a4, a1
+; RV64I-NEXT:    slli a5, a5, 1
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli t1, a0, 1
+; RV64I-NEXT:    srl t2, a0, a1
+; RV64I-NEXT:    sll a0, t0, a3
+; RV64I-NEXT:    sll a1, a5, a3
+; RV64I-NEXT:    sll a3, t1, a3
+; RV64I-NEXT:    srli a5, t2, 56
+; RV64I-NEXT:    srli t0, t2, 48
+; RV64I-NEXT:    srli t1, t2, 40
+; RV64I-NEXT:    srli t3, t2, 32
+; RV64I-NEXT:    srli t4, t2, 24
+; RV64I-NEXT:    srli t5, t2, 16
+; RV64I-NEXT:    srli t6, t2, 8
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    or a1, a4, a1
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    sb t3, 28(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t0, 30(a2)
 ; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a4, 24
-; RV64I-NEXT:    srli a6, a4, 16
-; RV64I-NEXT:    srli a7, a4, 8
-; RV64I-NEXT:    sb a4, 24(a2)
-; RV64I-NEXT:    sb a7, 25(a2)
-; RV64I-NEXT:    sb a6, 26(a2)
-; RV64I-NEXT:    sb a5, 27(a2)
+; RV64I-NEXT:    sb t2, 24(a2)
+; RV64I-NEXT:    sb t6, 25(a2)
+; RV64I-NEXT:    sb t5, 26(a2)
+; RV64I-NEXT:    sb t4, 27(a2)
 ; RV64I-NEXT:    srli a4, a3, 56
 ; RV64I-NEXT:    srli a5, a3, 48
 ; RV64I-NEXT:    srli a6, a3, 40
 ; RV64I-NEXT:    srli a7, a3, 32
+; RV64I-NEXT:    srli t0, a3, 24
+; RV64I-NEXT:    srli t1, a3, 16
+; RV64I-NEXT:    srli t2, a3, 8
+; RV64I-NEXT:    srli t3, a1, 56
+; RV64I-NEXT:    srli t4, a1, 48
+; RV64I-NEXT:    srli t5, a1, 40
+; RV64I-NEXT:    srli t6, a1, 32
+; RV64I-NEXT:    srli s0, a1, 24
+; RV64I-NEXT:    srli s1, a1, 16
+; RV64I-NEXT:    srli s2, a1, 8
+; RV64I-NEXT:    srli s3, a0, 56
+; RV64I-NEXT:    srli s4, a0, 48
+; RV64I-NEXT:    srli s5, a0, 40
 ; RV64I-NEXT:    sb a7, 20(a2)
 ; RV64I-NEXT:    sb a6, 21(a2)
 ; RV64I-NEXT:    sb a5, 22(a2)
 ; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    srli a6, a3, 8
+; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    sb a3, 16(a2)
-; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a6, a1, 32
-; RV64I-NEXT:    sb a6, 4(a2)
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    sb a4, 6(a2)
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    sb t2, 17(a2)
+; RV64I-NEXT:    sb t1, 18(a2)
+; RV64I-NEXT:    sb t0, 19(a2)
+; RV64I-NEXT:    srli a3, a0, 24
+; RV64I-NEXT:    sb t6, 4(a2)
+; RV64I-NEXT:    sb t5, 5(a2)
+; RV64I-NEXT:    sb t4, 6(a2)
+; RV64I-NEXT:    sb t3, 7(a2)
+; RV64I-NEXT:    srli a5, a0, 16
 ; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
-; RV64I-NEXT:    sb a4, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb s2, 1(a2)
+; RV64I-NEXT:    sb s1, 2(a2)
+; RV64I-NEXT:    sb s0, 3(a2)
+; RV64I-NEXT:    srli a1, a0, 8
+; RV64I-NEXT:    sb a4, 12(a2)
+; RV64I-NEXT:    sb s5, 13(a2)
+; RV64I-NEXT:    sb s4, 14(a2)
+; RV64I-NEXT:    sb s3, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a5, 10(a2)
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a4, a7, a6
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu ra, 22(a0)
+; RV32I-NEXT:    lbu a3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a7, a6, a7
-; RV32I-NEXT:    lbu a6, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a6, t2, a6
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    lbu a6, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or a6, t3, a6
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t1, a6, t1
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a6, t4, a6
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t2, a6, t2
-; RV32I-NEXT:    lbu a6, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a6
-; RV32I-NEXT:    lbu a6, 0(a1)
-; RV32I-NEXT:    lbu t4, 1(a1)
-; RV32I-NEXT:    or a0, a0, t3
-; RV32I-NEXT:    lbu t3, 2(a1)
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or t3, s7, s6
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or s2, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s8, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or a6, t4, a6
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t3
-; RV32I-NEXT:    or a6, a1, a6
-; RV32I-NEXT:    sw zero, 48(sp)
-; RV32I-NEXT:    sw zero, 52(sp)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
-; RV32I-NEXT:    sw zero, 32(sp)
-; RV32I-NEXT:    sw zero, 36(sp)
+; RV32I-NEXT:    sw zero, 64(sp)
+; RV32I-NEXT:    sw zero, 68(sp)
 ; RV32I-NEXT:    sw zero, 40(sp)
 ; RV32I-NEXT:    sw zero, 44(sp)
-; RV32I-NEXT:    sw t0, 16(sp)
-; RV32I-NEXT:    sw t1, 20(sp)
-; RV32I-NEXT:    sw t2, 24(sp)
-; RV32I-NEXT:    sw a0, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a7, 12(sp)
-; RV32I-NEXT:    srli a0, a6, 3
-; RV32I-NEXT:    andi a0, a0, 28
-; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    add a3, a1, a0
-; RV32I-NEXT:    lw a1, 4(a3)
-; RV32I-NEXT:    lw a4, 0(a3)
-; RV32I-NEXT:    lw a5, 8(a3)
-; RV32I-NEXT:    lw a7, 12(a3)
-; RV32I-NEXT:    srl a0, a1, a6
-; RV32I-NEXT:    andi t0, a6, 31
-; RV32I-NEXT:    xori t0, t0, 31
-; RV32I-NEXT:    slli t1, a5, 1
-; RV32I-NEXT:    sll t1, t1, t0
-; RV32I-NEXT:    or a0, a0, t1
-; RV32I-NEXT:    srl a4, a4, a6
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    sll a1, a1, t0
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    srl a4, a7, a6
-; RV32I-NEXT:    lw t1, 16(a3)
-; RV32I-NEXT:    lw t2, 20(a3)
-; RV32I-NEXT:    lw t3, 24(a3)
-; RV32I-NEXT:    lw t4, 28(a3)
-; RV32I-NEXT:    slli a3, t1, 1
-; RV32I-NEXT:    sll a3, a3, t0
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    srl a4, a5, a6
+; RV32I-NEXT:    sw zero, 48(sp)
+; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    addi s3, sp, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s4, t6
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s8
+; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s4
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, a0, t3
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, t4, s1
+; RV32I-NEXT:    or t3, t6, t5
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw t2, 32(sp)
+; RV32I-NEXT:    sw t3, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    srli a1, a0, 3
+; RV32I-NEXT:    andi a3, a0, 31
+; RV32I-NEXT:    andi a4, a1, 28
+; RV32I-NEXT:    xori a1, a3, 31
+; RV32I-NEXT:    add a4, s3, a4
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a6, 8(a4)
+; RV32I-NEXT:    lw a7, 12(a4)
+; RV32I-NEXT:    lw t0, 16(a4)
+; RV32I-NEXT:    lw t1, 20(a4)
+; RV32I-NEXT:    lw t2, 24(a4)
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    srl t3, a5, a0
+; RV32I-NEXT:    slli t4, a6, 1
+; RV32I-NEXT:    srl a3, a3, a0
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl t5, a7, a0
+; RV32I-NEXT:    slli t6, t0, 1
+; RV32I-NEXT:    srl a6, a6, a0
 ; RV32I-NEXT:    slli a7, a7, 1
-; RV32I-NEXT:    sll a5, a7, t0
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    srl a5, t2, a6
-; RV32I-NEXT:    slli a7, t3, 1
-; RV32I-NEXT:    sll a7, a7, t0
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    srl a7, t1, a6
-; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    sll t1, t2, t0
-; RV32I-NEXT:    or a7, a7, t1
-; RV32I-NEXT:    srl t1, t3, a6
-; RV32I-NEXT:    slli t2, t4, 1
-; RV32I-NEXT:    sll t0, t2, t0
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    srl a6, t4, a6
-; RV32I-NEXT:    srli t1, a6, 24
-; RV32I-NEXT:    srli t2, a6, 16
-; RV32I-NEXT:    srli t3, a6, 8
-; RV32I-NEXT:    sb a6, 28(a2)
-; RV32I-NEXT:    sb t3, 29(a2)
-; RV32I-NEXT:    sb t2, 30(a2)
-; RV32I-NEXT:    sb t1, 31(a2)
-; RV32I-NEXT:    srli a6, t0, 24
-; RV32I-NEXT:    srli t1, t0, 16
-; RV32I-NEXT:    srli t2, t0, 8
-; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    srl s0, t1, a0
+; RV32I-NEXT:    slli s1, t2, 1
+; RV32I-NEXT:    srl t0, t0, a0
+; RV32I-NEXT:    slli t1, t1, 1
+; RV32I-NEXT:    srl t2, t2, a0
+; RV32I-NEXT:    slli s2, a4, 1
+; RV32I-NEXT:    srl s3, a4, a0
+; RV32I-NEXT:    sll a0, t4, a1
+; RV32I-NEXT:    sll a4, a5, a1
+; RV32I-NEXT:    sll a5, t6, a1
+; RV32I-NEXT:    sll a7, a7, a1
+; RV32I-NEXT:    sll t4, s1, a1
+; RV32I-NEXT:    sll t1, t1, a1
+; RV32I-NEXT:    sll t6, s2, a1
+; RV32I-NEXT:    srli s1, s3, 24
+; RV32I-NEXT:    srli s2, s3, 16
+; RV32I-NEXT:    srli s4, s3, 8
+; RV32I-NEXT:    or a0, t3, a0
+; RV32I-NEXT:    or a1, a3, a4
+; RV32I-NEXT:    or a3, t5, a5
+; RV32I-NEXT:    or a4, a6, a7
+; RV32I-NEXT:    or a5, s0, t4
+; RV32I-NEXT:    or a6, t0, t1
+; RV32I-NEXT:    or a7, t2, t6
+; RV32I-NEXT:    sb s3, 28(a2)
+; RV32I-NEXT:    sb s4, 29(a2)
+; RV32I-NEXT:    sb s2, 30(a2)
+; RV32I-NEXT:    sb s1, 31(a2)
+; RV32I-NEXT:    srli t0, a7, 24
+; RV32I-NEXT:    srli t1, a7, 16
+; RV32I-NEXT:    srli t2, a7, 8
+; RV32I-NEXT:    srli t3, a6, 24
+; RV32I-NEXT:    srli t4, a6, 16
+; RV32I-NEXT:    srli t5, a6, 8
+; RV32I-NEXT:    srli t6, a5, 24
+; RV32I-NEXT:    srli s0, a5, 16
+; RV32I-NEXT:    srli s1, a5, 8
+; RV32I-NEXT:    srli s2, a4, 24
+; RV32I-NEXT:    srli s3, a4, 16
+; RV32I-NEXT:    srli s4, a4, 8
+; RV32I-NEXT:    srli s5, a3, 24
+; RV32I-NEXT:    srli s6, a3, 16
+; RV32I-NEXT:    srli s7, a3, 8
+; RV32I-NEXT:    srli s8, a1, 24
+; RV32I-NEXT:    srli s9, a1, 16
+; RV32I-NEXT:    sb a7, 24(a2)
 ; RV32I-NEXT:    sb t2, 25(a2)
 ; RV32I-NEXT:    sb t1, 26(a2)
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    srli a6, a7, 24
-; RV32I-NEXT:    srli t0, a7, 16
-; RV32I-NEXT:    srli t1, a7, 8
-; RV32I-NEXT:    sb a7, 16(a2)
-; RV32I-NEXT:    sb t1, 17(a2)
-; RV32I-NEXT:    sb t0, 18(a2)
-; RV32I-NEXT:    sb a6, 19(a2)
-; RV32I-NEXT:    srli a6, a5, 24
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    srli t0, a5, 8
+; RV32I-NEXT:    sb t0, 27(a2)
+; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb t5, 17(a2)
+; RV32I-NEXT:    sb t4, 18(a2)
+; RV32I-NEXT:    sb t3, 19(a2)
+; RV32I-NEXT:    srli a6, a0, 24
 ; RV32I-NEXT:    sb a5, 20(a2)
-; RV32I-NEXT:    sb t0, 21(a2)
-; RV32I-NEXT:    sb a7, 22(a2)
-; RV32I-NEXT:    sb a6, 23(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    srli a7, a4, 8
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    sb s0, 22(a2)
+; RV32I-NEXT:    sb t6, 23(a2)
+; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
-; RV32I-NEXT:    sb a6, 10(a2)
-; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    sb s4, 9(a2)
+; RV32I-NEXT:    sb s3, 10(a2)
+; RV32I-NEXT:    sb s2, 11(a2)
+; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    sb s7, 13(a2)
+; RV32I-NEXT:    sb s6, 14(a2)
+; RV32I-NEXT:    sb s5, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb a7, 1(a2)
+; RV32I-NEXT:    sb s9, 2(a2)
+; RV32I-NEXT:    sb s8, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -1723,420 +1775,472 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli s8, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a0, a0, 32
-; RV64I-NEXT:    lbu a7, 0(a1)
-; RV64I-NEXT:    lbu t0, 1(a1)
-; RV64I-NEXT:    or a0, a0, a6
-; RV64I-NEXT:    lbu a6, 2(a1)
-; RV64I-NEXT:    lbu t1, 3(a1)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 4(a1)
-; RV64I-NEXT:    lbu t1, 5(a1)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 6(a1)
+; RV64I-NEXT:    slli t1, t1, 16
+; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a6, a6, s8
+; RV64I-NEXT:    or a3, t0, a7
+; RV64I-NEXT:    or a4, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t3, s5, s4
+; RV64I-NEXT:    or t4, s7, s6
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    lbu s6, 5(a1)
+; RV64I-NEXT:    or s4, s7, s4
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli s6, s6, 8
+; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, a7
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a6, a1, a6
+; RV64I-NEXT:    or a1, a1, s7
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    addi a6, sp, 32
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t1, s0, t6
+; RV64I-NEXT:    or t2, s5, s1
+; RV64I-NEXT:    or t3, s4, s2
+; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t2, t2, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    sd a4, 40(sp)
-; RV64I-NEXT:    sd a5, 48(sp)
-; RV64I-NEXT:    sd a0, 56(sp)
-; RV64I-NEXT:    srli a0, a6, 3
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a5, 56(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a3, a1, 63
 ; RV64I-NEXT:    andi a0, a0, 24
-; RV64I-NEXT:    addi a1, sp, 32
-; RV64I-NEXT:    sub a1, a1, a0
-; RV64I-NEXT:    ld a3, 8(a1)
-; RV64I-NEXT:    ld a4, 0(a1)
-; RV64I-NEXT:    ld a5, 16(a1)
-; RV64I-NEXT:    ld a1, 24(a1)
-; RV64I-NEXT:    sll a0, a3, a6
-; RV64I-NEXT:    andi a7, a6, 63
-; RV64I-NEXT:    xori a7, a7, 63
+; RV64I-NEXT:    sub a0, a6, a0
+; RV64I-NEXT:    ld a4, 0(a0)
+; RV64I-NEXT:    ld a5, 8(a0)
+; RV64I-NEXT:    ld a6, 16(a0)
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    sll a7, a5, a1
 ; RV64I-NEXT:    srli t0, a4, 1
-; RV64I-NEXT:    srl t0, t0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    sll a1, a1, a6
-; RV64I-NEXT:    srli t0, a5, 1
-; RV64I-NEXT:    srl t0, t0, a7
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    sll a5, a5, a6
-; RV64I-NEXT:    srli a3, a3, 1
-; RV64I-NEXT:    srl a3, a3, a7
-; RV64I-NEXT:    or a3, a5, a3
-; RV64I-NEXT:    sll a4, a4, a6
+; RV64I-NEXT:    sll t1, a0, a1
+; RV64I-NEXT:    srli a0, a6, 1
+; RV64I-NEXT:    sll a6, a6, a1
+; RV64I-NEXT:    srli a5, a5, 1
+; RV64I-NEXT:    sll a4, a4, a1
+; RV64I-NEXT:    srl a1, t0, a3
+; RV64I-NEXT:    srl t0, a0, a3
+; RV64I-NEXT:    srl a3, a5, a3
 ; RV64I-NEXT:    srli a5, a4, 56
-; RV64I-NEXT:    srli a6, a4, 48
-; RV64I-NEXT:    srli a7, a4, 40
-; RV64I-NEXT:    srli t0, a4, 32
-; RV64I-NEXT:    sb t0, 4(a2)
-; RV64I-NEXT:    sb a7, 5(a2)
-; RV64I-NEXT:    sb a6, 6(a2)
+; RV64I-NEXT:    srli t2, a4, 48
+; RV64I-NEXT:    srli t3, a4, 40
+; RV64I-NEXT:    srli t4, a4, 32
+; RV64I-NEXT:    srli t5, a4, 24
+; RV64I-NEXT:    srli t6, a4, 16
+; RV64I-NEXT:    srli s0, a4, 8
+; RV64I-NEXT:    or a0, a7, a1
+; RV64I-NEXT:    or a1, t1, t0
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    sb t4, 4(a2)
+; RV64I-NEXT:    sb t3, 5(a2)
+; RV64I-NEXT:    sb t2, 6(a2)
 ; RV64I-NEXT:    sb a5, 7(a2)
-; RV64I-NEXT:    srli a5, a4, 24
-; RV64I-NEXT:    srli a6, a4, 16
-; RV64I-NEXT:    srli a7, a4, 8
 ; RV64I-NEXT:    sb a4, 0(a2)
-; RV64I-NEXT:    sb a7, 1(a2)
-; RV64I-NEXT:    sb a6, 2(a2)
-; RV64I-NEXT:    sb a5, 3(a2)
+; RV64I-NEXT:    sb s0, 1(a2)
+; RV64I-NEXT:    sb t6, 2(a2)
+; RV64I-NEXT:    sb t5, 3(a2)
 ; RV64I-NEXT:    srli a4, a3, 56
 ; RV64I-NEXT:    srli a5, a3, 48
 ; RV64I-NEXT:    srli a6, a3, 40
 ; RV64I-NEXT:    srli a7, a3, 32
+; RV64I-NEXT:    srli t0, a3, 24
+; RV64I-NEXT:    srli t1, a3, 16
+; RV64I-NEXT:    srli t2, a3, 8
+; RV64I-NEXT:    srli t3, a1, 56
+; RV64I-NEXT:    srli t4, a1, 48
+; RV64I-NEXT:    srli t5, a1, 40
+; RV64I-NEXT:    srli t6, a1, 32
+; RV64I-NEXT:    srli s0, a1, 24
+; RV64I-NEXT:    srli s1, a1, 16
+; RV64I-NEXT:    srli s2, a1, 8
+; RV64I-NEXT:    srli s3, a0, 56
+; RV64I-NEXT:    srli s4, a0, 48
+; RV64I-NEXT:    srli s5, a0, 40
 ; RV64I-NEXT:    sb a7, 20(a2)
 ; RV64I-NEXT:    sb a6, 21(a2)
 ; RV64I-NEXT:    sb a5, 22(a2)
 ; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    srli a6, a3, 8
+; RV64I-NEXT:    srli a4, a0, 32
 ; RV64I-NEXT:    sb a3, 16(a2)
-; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a6, a1, 32
-; RV64I-NEXT:    sb a6, 28(a2)
-; RV64I-NEXT:    sb a5, 29(a2)
-; RV64I-NEXT:    sb a4, 30(a2)
-; RV64I-NEXT:    sb a3, 31(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
+; RV64I-NEXT:    sb t2, 17(a2)
+; RV64I-NEXT:    sb t1, 18(a2)
+; RV64I-NEXT:    sb t0, 19(a2)
+; RV64I-NEXT:    srli a3, a0, 24
+; RV64I-NEXT:    sb t6, 28(a2)
+; RV64I-NEXT:    sb t5, 29(a2)
+; RV64I-NEXT:    sb t4, 30(a2)
+; RV64I-NEXT:    sb t3, 31(a2)
+; RV64I-NEXT:    srli a5, a0, 16
 ; RV64I-NEXT:    sb a1, 24(a2)
-; RV64I-NEXT:    sb a5, 25(a2)
-; RV64I-NEXT:    sb a4, 26(a2)
-; RV64I-NEXT:    sb a3, 27(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a0, 24
-; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb s2, 25(a2)
+; RV64I-NEXT:    sb s1, 26(a2)
+; RV64I-NEXT:    sb s0, 27(a2)
+; RV64I-NEXT:    srli a1, a0, 8
+; RV64I-NEXT:    sb a4, 12(a2)
+; RV64I-NEXT:    sb s5, 13(a2)
+; RV64I-NEXT:    sb s4, 14(a2)
+; RV64I-NEXT:    sb s3, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
-; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb a1, 9(a2)
+; RV64I-NEXT:    sb a5, 10(a2)
+; RV64I-NEXT:    sb a3, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a4, a7, a6
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu ra, 22(a0)
+; RV32I-NEXT:    lbu a3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a6, a6, a7
-; RV32I-NEXT:    lbu a7, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a7, a7, 16
+; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a7, t2, a7
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or t0, a7, t0
-; RV32I-NEXT:    lbu a7, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or a7, t3, a7
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t1, a7, t1
-; RV32I-NEXT:    lbu a7, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a7, t4, a7
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t2, a7, t2
-; RV32I-NEXT:    lbu a7, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli a7, a7, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a0, a0, a7
-; RV32I-NEXT:    lbu a7, 0(a1)
-; RV32I-NEXT:    lbu t4, 1(a1)
-; RV32I-NEXT:    or a0, a0, t3
-; RV32I-NEXT:    lbu t3, 2(a1)
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s2, s2, 8
+; RV32I-NEXT:    slli s4, s4, 16
+; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or t3, s7, s6
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or s2, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s8, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or a7, t4, a7
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, t3
-; RV32I-NEXT:    or a7, a1, a7
-; RV32I-NEXT:    sw zero, 16(sp)
-; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    sw zero, 0(sp)
-; RV32I-NEXT:    sw zero, 4(sp)
+; RV32I-NEXT:    sw zero, 32(sp)
+; RV32I-NEXT:    sw zero, 36(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    sw t0, 48(sp)
-; RV32I-NEXT:    sw t1, 52(sp)
-; RV32I-NEXT:    sw t2, 56(sp)
-; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a3, 32(sp)
-; RV32I-NEXT:    sw a4, 36(sp)
-; RV32I-NEXT:    sw a5, 40(sp)
-; RV32I-NEXT:    sw a6, 44(sp)
-; RV32I-NEXT:    srli a0, a7, 3
-; RV32I-NEXT:    andi a0, a0, 28
-; RV32I-NEXT:    addi a1, sp, 32
-; RV32I-NEXT:    sub a4, a1, a0
-; RV32I-NEXT:    lw a3, 4(a4)
-; RV32I-NEXT:    lw a5, 0(a4)
-; RV32I-NEXT:    lw a6, 8(a4)
-; RV32I-NEXT:    lw t0, 12(a4)
-; RV32I-NEXT:    sll a0, a3, a7
-; RV32I-NEXT:    andi a1, a7, 31
-; RV32I-NEXT:    xori t1, a1, 31
-; RV32I-NEXT:    srli a1, a5, 1
-; RV32I-NEXT:    srl a1, a1, t1
-; RV32I-NEXT:    or a0, a0, a1
-; RV32I-NEXT:    sll a1, t0, a7
-; RV32I-NEXT:    srli t2, a6, 1
-; RV32I-NEXT:    srl t2, t2, t1
-; RV32I-NEXT:    or a1, a1, t2
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    srl a3, a3, t1
-; RV32I-NEXT:    lw t2, 16(a4)
-; RV32I-NEXT:    lw t3, 20(a4)
-; RV32I-NEXT:    or a3, a6, a3
-; RV32I-NEXT:    lw a6, 24(a4)
-; RV32I-NEXT:    lw t4, 28(a4)
-; RV32I-NEXT:    sll a4, t3, a7
-; RV32I-NEXT:    srli t5, t2, 1
-; RV32I-NEXT:    srl t5, t5, t1
-; RV32I-NEXT:    or a4, a4, t5
-; RV32I-NEXT:    sll t2, t2, a7
-; RV32I-NEXT:    srli t0, t0, 1
-; RV32I-NEXT:    srl t0, t0, t1
-; RV32I-NEXT:    or t0, t2, t0
-; RV32I-NEXT:    sll t2, t4, a7
-; RV32I-NEXT:    srli t4, a6, 1
-; RV32I-NEXT:    srl t4, t4, t1
-; RV32I-NEXT:    or t2, t2, t4
-; RV32I-NEXT:    sll a6, a6, a7
-; RV32I-NEXT:    srli t3, t3, 1
-; RV32I-NEXT:    srl t1, t3, t1
-; RV32I-NEXT:    or a6, a6, t1
-; RV32I-NEXT:    sll a5, a5, a7
-; RV32I-NEXT:    srli a7, a5, 24
-; RV32I-NEXT:    srli t1, a5, 16
-; RV32I-NEXT:    srli t3, a5, 8
-; RV32I-NEXT:    sb a5, 0(a2)
-; RV32I-NEXT:    sb t3, 1(a2)
-; RV32I-NEXT:    sb t1, 2(a2)
-; RV32I-NEXT:    sb a7, 3(a2)
-; RV32I-NEXT:    srli a5, a6, 24
-; RV32I-NEXT:    srli a7, a6, 16
-; RV32I-NEXT:    srli t1, a6, 8
-; RV32I-NEXT:    sb a6, 24(a2)
-; RV32I-NEXT:    sb t1, 25(a2)
-; RV32I-NEXT:    sb a7, 26(a2)
-; RV32I-NEXT:    sb a5, 27(a2)
-; RV32I-NEXT:    srli a5, t2, 24
-; RV32I-NEXT:    srli a6, t2, 16
-; RV32I-NEXT:    srli a7, t2, 8
-; RV32I-NEXT:    sb t2, 28(a2)
-; RV32I-NEXT:    sb a7, 29(a2)
-; RV32I-NEXT:    sb a6, 30(a2)
-; RV32I-NEXT:    sb a5, 31(a2)
-; RV32I-NEXT:    srli a5, t0, 24
-; RV32I-NEXT:    srli a6, t0, 16
-; RV32I-NEXT:    srli a7, t0, 8
-; RV32I-NEXT:    sb t0, 16(a2)
-; RV32I-NEXT:    sb a7, 17(a2)
-; RV32I-NEXT:    sb a6, 18(a2)
-; RV32I-NEXT:    sb a5, 19(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    srli a7, a4, 8
+; RV32I-NEXT:    sw zero, 16(sp)
+; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    addi s3, sp, 40
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s4, t6
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s8
+; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s4
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, a0, t3
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, t4, s1
+; RV32I-NEXT:    or t3, t6, t5
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    sw t0, 56(sp)
+; RV32I-NEXT:    sw t1, 60(sp)
+; RV32I-NEXT:    sw t2, 64(sp)
+; RV32I-NEXT:    sw t3, 68(sp)
+; RV32I-NEXT:    sw a4, 40(sp)
+; RV32I-NEXT:    sw a5, 44(sp)
+; RV32I-NEXT:    sw a6, 48(sp)
+; RV32I-NEXT:    sw a7, 52(sp)
+; RV32I-NEXT:    srli a1, a0, 3
+; RV32I-NEXT:    andi a3, a0, 31
+; RV32I-NEXT:    andi a4, a1, 28
+; RV32I-NEXT:    xori a1, a3, 31
+; RV32I-NEXT:    sub a3, s3, a4
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    lw a5, 4(a3)
+; RV32I-NEXT:    lw a6, 8(a3)
+; RV32I-NEXT:    lw a7, 12(a3)
+; RV32I-NEXT:    lw t0, 16(a3)
+; RV32I-NEXT:    lw t1, 20(a3)
+; RV32I-NEXT:    lw t2, 24(a3)
+; RV32I-NEXT:    lw a3, 28(a3)
+; RV32I-NEXT:    sll t3, a5, a0
+; RV32I-NEXT:    srli t4, a4, 1
+; RV32I-NEXT:    sll t5, a7, a0
+; RV32I-NEXT:    srli t6, a6, 1
+; RV32I-NEXT:    sll a6, a6, a0
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    sll s0, t1, a0
+; RV32I-NEXT:    srli s1, t0, 1
+; RV32I-NEXT:    sll t0, t0, a0
+; RV32I-NEXT:    srli a7, a7, 1
+; RV32I-NEXT:    sll s2, a3, a0
+; RV32I-NEXT:    srli a3, t2, 1
+; RV32I-NEXT:    sll t2, t2, a0
+; RV32I-NEXT:    srli t1, t1, 1
+; RV32I-NEXT:    sll s3, a4, a0
+; RV32I-NEXT:    srl a0, t4, a1
+; RV32I-NEXT:    srl a4, t6, a1
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    srl t4, s1, a1
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    srl t6, a3, a1
+; RV32I-NEXT:    srl t1, t1, a1
+; RV32I-NEXT:    srli s1, s3, 24
+; RV32I-NEXT:    srli s4, s3, 16
+; RV32I-NEXT:    srli s5, s3, 8
+; RV32I-NEXT:    or a0, t3, a0
+; RV32I-NEXT:    or a1, t5, a4
+; RV32I-NEXT:    or a3, a6, a5
+; RV32I-NEXT:    or a4, s0, t4
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, s2, t6
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    sb s3, 0(a2)
+; RV32I-NEXT:    sb s5, 1(a2)
+; RV32I-NEXT:    sb s4, 2(a2)
+; RV32I-NEXT:    sb s1, 3(a2)
+; RV32I-NEXT:    srli t0, a7, 24
+; RV32I-NEXT:    srli t1, a7, 16
+; RV32I-NEXT:    srli t2, a7, 8
+; RV32I-NEXT:    srli t3, a6, 24
+; RV32I-NEXT:    srli t4, a6, 16
+; RV32I-NEXT:    srli t5, a6, 8
+; RV32I-NEXT:    srli t6, a5, 24
+; RV32I-NEXT:    srli s0, a5, 16
+; RV32I-NEXT:    srli s1, a5, 8
+; RV32I-NEXT:    srli s2, a4, 24
+; RV32I-NEXT:    srli s3, a4, 16
+; RV32I-NEXT:    srli s4, a4, 8
+; RV32I-NEXT:    srli s5, a3, 24
+; RV32I-NEXT:    srli s6, a3, 16
+; RV32I-NEXT:    srli s7, a3, 8
+; RV32I-NEXT:    srli s8, a1, 24
+; RV32I-NEXT:    srli s9, a1, 16
+; RV32I-NEXT:    sb a7, 24(a2)
+; RV32I-NEXT:    sb t2, 25(a2)
+; RV32I-NEXT:    sb t1, 26(a2)
+; RV32I-NEXT:    sb t0, 27(a2)
+; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    sb a6, 28(a2)
+; RV32I-NEXT:    sb t5, 29(a2)
+; RV32I-NEXT:    sb t4, 30(a2)
+; RV32I-NEXT:    sb t3, 31(a2)
+; RV32I-NEXT:    srli a6, a0, 24
+; RV32I-NEXT:    sb a5, 16(a2)
+; RV32I-NEXT:    sb s1, 17(a2)
+; RV32I-NEXT:    sb s0, 18(a2)
+; RV32I-NEXT:    sb t6, 19(a2)
+; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 20(a2)
-; RV32I-NEXT:    sb a7, 21(a2)
-; RV32I-NEXT:    sb a6, 22(a2)
-; RV32I-NEXT:    sb a5, 23(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    sb s4, 21(a2)
+; RV32I-NEXT:    sb s3, 22(a2)
+; RV32I-NEXT:    sb s2, 23(a2)
+; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 8(a2)
-; RV32I-NEXT:    sb a6, 9(a2)
-; RV32I-NEXT:    sb a5, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    sb s7, 9(a2)
+; RV32I-NEXT:    sb s6, 10(a2)
+; RV32I-NEXT:    sb s5, 11(a2)
 ; RV32I-NEXT:    sb a1, 12(a2)
-; RV32I-NEXT:    sb a5, 13(a2)
-; RV32I-NEXT:    sb a4, 14(a2)
-; RV32I-NEXT:    sb a3, 15(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb a7, 13(a2)
+; RV32I-NEXT:    sb s9, 14(a2)
+; RV32I-NEXT:    sb s8, 15(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1
@@ -2147,422 +2251,474 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -64
-; RV64I-NEXT:    lbu a3, 1(a0)
-; RV64I-NEXT:    lbu a4, 0(a0)
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    lbu a3, 0(a0)
+; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
 ; RV64I-NEXT:    lbu a6, 3(a0)
-; RV64I-NEXT:    slli a3, a3, 8
-; RV64I-NEXT:    or a3, a3, a4
+; RV64I-NEXT:    lbu a7, 4(a0)
+; RV64I-NEXT:    lbu t0, 5(a0)
+; RV64I-NEXT:    lbu t1, 6(a0)
+; RV64I-NEXT:    lbu t2, 7(a0)
+; RV64I-NEXT:    lbu t3, 8(a0)
+; RV64I-NEXT:    lbu t4, 9(a0)
+; RV64I-NEXT:    lbu t5, 10(a0)
+; RV64I-NEXT:    lbu t6, 11(a0)
+; RV64I-NEXT:    lbu s0, 12(a0)
+; RV64I-NEXT:    lbu s1, 13(a0)
+; RV64I-NEXT:    lbu s2, 14(a0)
+; RV64I-NEXT:    lbu s3, 15(a0)
+; RV64I-NEXT:    lbu s4, 16(a0)
+; RV64I-NEXT:    lbu s5, 17(a0)
+; RV64I-NEXT:    lbu s6, 18(a0)
+; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
 ; RV64I-NEXT:    slli a5, a5, 16
 ; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    lbu a5, 4(a0)
-; RV64I-NEXT:    lbu a6, 5(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 6(a0)
-; RV64I-NEXT:    lbu a7, 7(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    lbu a5, 8(a0)
-; RV64I-NEXT:    lbu a6, 9(a0)
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    lbu a4, 10(a0)
-; RV64I-NEXT:    lbu a7, 11(a0)
-; RV64I-NEXT:    slli a6, a6, 8
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    slli a4, a4, 16
-; RV64I-NEXT:    slli a7, a7, 24
-; RV64I-NEXT:    or a4, a7, a4
-; RV64I-NEXT:    lbu a6, 12(a0)
-; RV64I-NEXT:    lbu a7, 13(a0)
-; RV64I-NEXT:    or a4, a4, a5
-; RV64I-NEXT:    lbu a5, 14(a0)
-; RV64I-NEXT:    lbu t0, 15(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    slli a5, a5, 32
-; RV64I-NEXT:    lbu a6, 16(a0)
-; RV64I-NEXT:    lbu a7, 17(a0)
-; RV64I-NEXT:    or a4, a5, a4
-; RV64I-NEXT:    lbu a5, 18(a0)
-; RV64I-NEXT:    lbu t0, 19(a0)
-; RV64I-NEXT:    slli a7, a7, 8
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli t0, t0, 24
-; RV64I-NEXT:    or a5, t0, a5
-; RV64I-NEXT:    lbu a7, 20(a0)
-; RV64I-NEXT:    lbu t0, 21(a0)
-; RV64I-NEXT:    or a5, a5, a6
-; RV64I-NEXT:    lbu a6, 22(a0)
-; RV64I-NEXT:    lbu t1, 23(a0)
-; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    lbu a7, 24(a0)
-; RV64I-NEXT:    lbu t0, 25(a0)
-; RV64I-NEXT:    or a5, a6, a5
-; RV64I-NEXT:    lbu a6, 26(a0)
-; RV64I-NEXT:    lbu t1, 27(a0)
 ; RV64I-NEXT:    slli t0, t0, 8
-; RV64I-NEXT:    or a7, t0, a7
-; RV64I-NEXT:    slli a6, a6, 16
-; RV64I-NEXT:    slli t1, t1, 24
-; RV64I-NEXT:    or a6, t1, a6
-; RV64I-NEXT:    lbu t0, 28(a0)
-; RV64I-NEXT:    lbu t1, 29(a0)
-; RV64I-NEXT:    or a6, a6, a7
-; RV64I-NEXT:    lbu a7, 30(a0)
-; RV64I-NEXT:    lbu a0, 31(a0)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
-; RV64I-NEXT:    slli a0, a0, 24
-; RV64I-NEXT:    or a0, a0, a7
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    slli a7, a0, 32
-; RV64I-NEXT:    lbu t0, 0(a1)
-; RV64I-NEXT:    lbu t1, 1(a1)
-; RV64I-NEXT:    or a6, a7, a6
-; RV64I-NEXT:    lbu a7, 2(a1)
-; RV64I-NEXT:    lbu t2, 3(a1)
-; RV64I-NEXT:    slli t1, t1, 8
-; RV64I-NEXT:    or t0, t1, t0
-; RV64I-NEXT:    slli a7, a7, 16
+; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
-; RV64I-NEXT:    or a7, t2, a7
-; RV64I-NEXT:    lbu t1, 4(a1)
-; RV64I-NEXT:    lbu t2, 5(a1)
-; RV64I-NEXT:    or a7, a7, t0
-; RV64I-NEXT:    lbu t0, 6(a1)
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
+; RV64I-NEXT:    slli t4, t4, 8
+; RV64I-NEXT:    slli t5, t5, 16
+; RV64I-NEXT:    slli t6, t6, 24
+; RV64I-NEXT:    slli s1, s1, 8
+; RV64I-NEXT:    slli s2, s2, 16
+; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
+; RV64I-NEXT:    slli s5, s5, 8
+; RV64I-NEXT:    slli s6, s6, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    slli s9, s9, 8
+; RV64I-NEXT:    or t3, s5, s4
+; RV64I-NEXT:    or t4, s7, s6
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
+; RV64I-NEXT:    lbu s4, 29(a0)
+; RV64I-NEXT:    lbu s5, 30(a0)
+; RV64I-NEXT:    lbu s6, 31(a0)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    lbu s3, 4(a1)
+; RV64I-NEXT:    lbu s6, 5(a1)
+; RV64I-NEXT:    or s4, s7, s4
+; RV64I-NEXT:    lbu s7, 6(a1)
 ; RV64I-NEXT:    lbu a1, 7(a1)
-; RV64I-NEXT:    slli t2, t2, 8
-; RV64I-NEXT:    or t1, t2, t1
-; RV64I-NEXT:    slli t0, t0, 16
+; RV64I-NEXT:    slli s6, s6, 8
+; RV64I-NEXT:    or s3, s6, s3
+; RV64I-NEXT:    slli s7, s7, 16
 ; RV64I-NEXT:    slli a1, a1, 24
-; RV64I-NEXT:    or a1, a1, t0
-; RV64I-NEXT:    or a1, a1, t1
+; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    mv s6, sp
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t0, s0, t6
+; RV64I-NEXT:    or t1, s5, s1
+; RV64I-NEXT:    or t2, s4, s2
+; RV64I-NEXT:    or a1, a1, s3
+; RV64I-NEXT:    slli a4, a4, 32
+; RV64I-NEXT:    slli a6, a6, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t3, t1, 32
 ; RV64I-NEXT:    slli a1, a1, 32
-; RV64I-NEXT:    or a7, a1, a7
-; RV64I-NEXT:    sraiw a0, a0, 31
-; RV64I-NEXT:    sd a0, 32(sp)
-; RV64I-NEXT:    sd a0, 40(sp)
-; RV64I-NEXT:    sd a0, 48(sp)
-; RV64I-NEXT:    sd a0, 56(sp)
+; RV64I-NEXT:    sraiw t1, t1, 31
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, t3, t0
+; RV64I-NEXT:    or a1, a1, t2
+; RV64I-NEXT:    sd t1, 32(sp)
+; RV64I-NEXT:    sd t1, 40(sp)
+; RV64I-NEXT:    sd t1, 48(sp)
+; RV64I-NEXT:    sd t1, 56(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a6, 24(sp)
-; RV64I-NEXT:    srli a0, a7, 3
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a3, a1, 63
 ; RV64I-NEXT:    andi a0, a0, 24
-; RV64I-NEXT:    mv a1, sp
-; RV64I-NEXT:    add a0, a1, a0
-; RV64I-NEXT:    ld a1, 8(a0)
-; RV64I-NEXT:    ld a3, 0(a0)
-; RV64I-NEXT:    ld a4, 16(a0)
-; RV64I-NEXT:    ld a5, 24(a0)
-; RV64I-NEXT:    srl a0, a1, a7
-; RV64I-NEXT:    andi a6, a7, 63
-; RV64I-NEXT:    xori a6, a6, 63
-; RV64I-NEXT:    slli t0, a4, 1
-; RV64I-NEXT:    sll t0, t0, a6
-; RV64I-NEXT:    or a0, a0, t0
-; RV64I-NEXT:    srl a3, a3, a7
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    sll a1, a1, a6
-; RV64I-NEXT:    or a1, a3, a1
-; RV64I-NEXT:    srl a3, a4, a7
-; RV64I-NEXT:    slli a4, a5, 1
-; RV64I-NEXT:    sll a4, a4, a6
-; RV64I-NEXT:    or a3, a3, a4
-; RV64I-NEXT:    sra a4, a5, a7
-; RV64I-NEXT:    srli a5, a4, 56
-; RV64I-NEXT:    srli a6, a4, 48
-; RV64I-NEXT:    srli a7, a4, 40
-; RV64I-NEXT:    srli t0, a4, 32
-; RV64I-NEXT:    sb t0, 28(a2)
-; RV64I-NEXT:    sb a7, 29(a2)
-; RV64I-NEXT:    sb a6, 30(a2)
+; RV64I-NEXT:    add a0, s6, a0
+; RV64I-NEXT:    ld a4, 0(a0)
+; RV64I-NEXT:    ld a5, 8(a0)
+; RV64I-NEXT:    ld a6, 16(a0)
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    srl a7, a5, a1
+; RV64I-NEXT:    slli t0, a6, 1
+; RV64I-NEXT:    srl a4, a4, a1
+; RV64I-NEXT:    slli a5, a5, 1
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli t1, a0, 1
+; RV64I-NEXT:    sra t2, a0, a1
+; RV64I-NEXT:    sll a0, t0, a3
+; RV64I-NEXT:    sll a1, a5, a3
+; RV64I-NEXT:    sll a3, t1, a3
+; RV64I-NEXT:    srli a5, t2, 56
+; RV64I-NEXT:    srli t0, t2, 48
+; RV64I-NEXT:    srli t1, t2, 40
+; RV64I-NEXT:    srli t3, t2, 32
+; RV64I-NEXT:    srli t4, t2, 24
+; RV64I-NEXT:    srli t5, t2, 16
+; RV64I-NEXT:    srli t6, t2, 8
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    or a1, a4, a1
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    sb t3, 28(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t0, 30(a2)
 ; RV64I-NEXT:    sb a5, 31(a2)
-; RV64I-NEXT:    srli a5, a4, 24
-; RV64I-NEXT:    srli a6, a4, 16
-; RV64I-NEXT:    srli a7, a4, 8
-; RV64I-NEXT:    sb a4, 24(a2)
-; RV64I-NEXT:    sb a7, 25(a2)
-; RV64I-NEXT:    sb a6, 26(a2)
-; RV64I-NEXT:    sb a5, 27(a2)
+; RV64I-NEXT:    sb t2, 24(a2)
+; RV64I-NEXT:    sb t6, 25(a2)
+; RV64I-NEXT:    sb t5, 26(a2)
+; RV64I-NEXT:    sb t4, 27(a2)
 ; RV64I-NEXT:    srli a4, a3, 56
 ; RV64I-NEXT:    srli a5, a3, 48
 ; RV64I-NEXT:    srli a6, a3, 40
 ; RV64I-NEXT:    srli a7, a3, 32
+; RV64I-NEXT:    srli t0, a3, 24
+; RV64I-NEXT:    srli t1, a3, 16
+; RV64I-NEXT:    srli t2, a3, 8
+; RV64I-NEXT:    srli t3, a1, 56
+; RV64I-NEXT:    srli t4, a1, 48
+; RV64I-NEXT:    srli t5, a1, 40
+; RV64I-NEXT:    srli t6, a1, 32
+; RV64I-NEXT:    srli s0, a1, 24
+; RV64I-NEXT:    srli s1, a1, 16
+; RV64I-NEXT:    srli s2, a1, 8
+; RV64I-NEXT:    srli s3, a0, 56
+; RV64I-NEXT:    srli s4, a0, 48
+; RV64I-NEXT:    srli s5, a0, 40
+; RV64I-NEXT:    srli s6, a0, 32
 ; RV64I-NEXT:    sb a7, 20(a2)
 ; RV64I-NEXT:    sb a6, 21(a2)
 ; RV64I-NEXT:    sb a5, 22(a2)
 ; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a3, 24
-; RV64I-NEXT:    srli a5, a3, 16
-; RV64I-NEXT:    srli a6, a3, 8
+; RV64I-NEXT:    srli a4, a0, 24
 ; RV64I-NEXT:    sb a3, 16(a2)
-; RV64I-NEXT:    sb a6, 17(a2)
-; RV64I-NEXT:    sb a5, 18(a2)
-; RV64I-NEXT:    sb a4, 19(a2)
-; RV64I-NEXT:    srli a3, a1, 56
-; RV64I-NEXT:    srli a4, a1, 48
-; RV64I-NEXT:    srli a5, a1, 40
-; RV64I-NEXT:    srli a6, a1, 32
-; RV64I-NEXT:    sb a6, 4(a2)
-; RV64I-NEXT:    sb a5, 5(a2)
-; RV64I-NEXT:    sb a4, 6(a2)
-; RV64I-NEXT:    sb a3, 7(a2)
-; RV64I-NEXT:    srli a3, a1, 24
-; RV64I-NEXT:    srli a4, a1, 16
-; RV64I-NEXT:    srli a5, a1, 8
-; RV64I-NEXT:    sb a1, 0(a2)
-; RV64I-NEXT:    sb a5, 1(a2)
-; RV64I-NEXT:    sb a4, 2(a2)
-; RV64I-NEXT:    sb a3, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 56
-; RV64I-NEXT:    srli a3, a0, 48
-; RV64I-NEXT:    srli a4, a0, 40
-; RV64I-NEXT:    srli a5, a0, 32
-; RV64I-NEXT:    sb a5, 12(a2)
-; RV64I-NEXT:    sb a4, 13(a2)
-; RV64I-NEXT:    sb a3, 14(a2)
-; RV64I-NEXT:    sb a1, 15(a2)
-; RV64I-NEXT:    srli a1, a0, 24
+; RV64I-NEXT:    sb t2, 17(a2)
+; RV64I-NEXT:    sb t1, 18(a2)
+; RV64I-NEXT:    sb t0, 19(a2)
 ; RV64I-NEXT:    srli a3, a0, 16
-; RV64I-NEXT:    srli a4, a0, 8
+; RV64I-NEXT:    sb t6, 4(a2)
+; RV64I-NEXT:    sb t5, 5(a2)
+; RV64I-NEXT:    sb t4, 6(a2)
+; RV64I-NEXT:    sb t3, 7(a2)
+; RV64I-NEXT:    srli a5, a0, 8
+; RV64I-NEXT:    sb a1, 0(a2)
+; RV64I-NEXT:    sb s2, 1(a2)
+; RV64I-NEXT:    sb s1, 2(a2)
+; RV64I-NEXT:    sb s0, 3(a2)
+; RV64I-NEXT:    sb s6, 12(a2)
+; RV64I-NEXT:    sb s5, 13(a2)
+; RV64I-NEXT:    sb s4, 14(a2)
+; RV64I-NEXT:    sb s3, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a4, 9(a2)
+; RV64I-NEXT:    sb a5, 9(a2)
 ; RV64I-NEXT:    sb a3, 10(a2)
-; RV64I-NEXT:    sb a1, 11(a2)
-; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    sb a4, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    addi sp, sp, -64
-; RV32I-NEXT:    lbu a3, 1(a0)
-; RV32I-NEXT:    lbu a4, 0(a0)
-; RV32I-NEXT:    lbu a5, 2(a0)
-; RV32I-NEXT:    lbu a6, 3(a0)
-; RV32I-NEXT:    slli a3, a3, 8
-; RV32I-NEXT:    or a3, a3, a4
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli a6, a6, 24
-; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    addi sp, sp, -128
+; RV32I-NEXT:    sw ra, 124(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 120(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 116(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 112(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 108(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 104(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s5, 100(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s6, 96(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s7, 92(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s8, 88(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s9, 84(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s10, 80(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s11, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    lbu a3, 0(a0)
+; RV32I-NEXT:    lbu a4, 1(a0)
+; RV32I-NEXT:    lbu a6, 2(a0)
+; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
-; RV32I-NEXT:    lbu a6, 5(a0)
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    lbu a4, 6(a0)
-; RV32I-NEXT:    lbu a7, 7(a0)
-; RV32I-NEXT:    slli a6, a6, 8
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    slli a4, a4, 16
+; RV32I-NEXT:    lbu t0, 5(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
+; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
-; RV32I-NEXT:    or a4, a7, a4
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    lbu a5, 10(a0)
-; RV32I-NEXT:    lbu t0, 11(a0)
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    slli a5, a5, 16
-; RV32I-NEXT:    slli t0, t0, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    lbu a7, 12(a0)
-; RV32I-NEXT:    lbu t0, 13(a0)
-; RV32I-NEXT:    or a5, a5, a6
-; RV32I-NEXT:    lbu a6, 14(a0)
-; RV32I-NEXT:    lbu t1, 15(a0)
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    or a4, a7, a6
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a7, t0, a7
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t1, t1, 24
-; RV32I-NEXT:    or a6, t1, a6
-; RV32I-NEXT:    lbu t0, 16(a0)
-; RV32I-NEXT:    lbu t1, 17(a0)
-; RV32I-NEXT:    or a7, a6, a7
-; RV32I-NEXT:    lbu a6, 18(a0)
-; RV32I-NEXT:    lbu t2, 19(a0)
-; RV32I-NEXT:    slli t1, t1, 8
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    or a6, t2, a6
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    or t0, a6, t0
-; RV32I-NEXT:    lbu a6, 22(a0)
-; RV32I-NEXT:    lbu t3, 23(a0)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or a6, t3, a6
-; RV32I-NEXT:    lbu t2, 24(a0)
-; RV32I-NEXT:    lbu t3, 25(a0)
-; RV32I-NEXT:    or t1, a6, t1
-; RV32I-NEXT:    lbu a6, 26(a0)
-; RV32I-NEXT:    lbu t4, 27(a0)
-; RV32I-NEXT:    slli t3, t3, 8
-; RV32I-NEXT:    or t2, t3, t2
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a6, t4, a6
-; RV32I-NEXT:    lbu t3, 28(a0)
-; RV32I-NEXT:    lbu t4, 29(a0)
-; RV32I-NEXT:    or t2, a6, t2
-; RV32I-NEXT:    lbu a6, 30(a0)
-; RV32I-NEXT:    lbu a0, 31(a0)
 ; RV32I-NEXT:    slli t4, t4, 8
-; RV32I-NEXT:    or t3, t4, t3
-; RV32I-NEXT:    slli a6, a6, 16
-; RV32I-NEXT:    slli a0, a0, 24
-; RV32I-NEXT:    or a6, a0, a6
-; RV32I-NEXT:    lbu t4, 0(a1)
-; RV32I-NEXT:    lbu t5, 1(a1)
-; RV32I-NEXT:    or t3, a6, t3
-; RV32I-NEXT:    lbu a6, 2(a1)
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu ra, 24(a0)
+; RV32I-NEXT:    lbu a3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s1, s1, 8
+; RV32I-NEXT:    slli s2, s2, 16
+; RV32I-NEXT:    slli s3, s3, 24
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu s1, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s2, s7, s6
+; RV32I-NEXT:    or s3, s9, s8
+; RV32I-NEXT:    or s4, s11, s10
+; RV32I-NEXT:    lbu s5, 0(a1)
+; RV32I-NEXT:    lbu s6, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
 ; RV32I-NEXT:    lbu a1, 3(a1)
-; RV32I-NEXT:    slli t5, t5, 8
-; RV32I-NEXT:    or t4, t5, t4
-; RV32I-NEXT:    slli a6, a6, 16
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, ra
+; RV32I-NEXT:    addi s8, sp, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s7, s7, 16
 ; RV32I-NEXT:    slli a1, a1, 24
-; RV32I-NEXT:    or a1, a1, a6
-; RV32I-NEXT:    or a6, a1, t4
-; RV32I-NEXT:    srai a0, a0, 31
-; RV32I-NEXT:    sw a0, 48(sp)
-; RV32I-NEXT:    sw a0, 52(sp)
-; RV32I-NEXT:    sw a0, 56(sp)
-; RV32I-NEXT:    sw a0, 60(sp)
-; RV32I-NEXT:    sw a0, 32(sp)
-; RV32I-NEXT:    sw a0, 36(sp)
-; RV32I-NEXT:    sw a0, 40(sp)
-; RV32I-NEXT:    sw a0, 44(sp)
-; RV32I-NEXT:    sw t0, 16(sp)
-; RV32I-NEXT:    sw t1, 20(sp)
-; RV32I-NEXT:    sw t2, 24(sp)
-; RV32I-NEXT:    sw t3, 28(sp)
-; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a4, 4(sp)
-; RV32I-NEXT:    sw a5, 8(sp)
-; RV32I-NEXT:    sw a7, 12(sp)
-; RV32I-NEXT:    srli a0, a6, 3
-; RV32I-NEXT:    andi a0, a0, 28
-; RV32I-NEXT:    mv a1, sp
-; RV32I-NEXT:    add a3, a1, a0
-; RV32I-NEXT:    lw a1, 4(a3)
-; RV32I-NEXT:    lw a4, 0(a3)
-; RV32I-NEXT:    lw a5, 8(a3)
-; RV32I-NEXT:    lw a7, 12(a3)
-; RV32I-NEXT:    srl a0, a1, a6
-; RV32I-NEXT:    andi t0, a6, 31
-; RV32I-NEXT:    xori t0, t0, 31
-; RV32I-NEXT:    slli t1, a5, 1
-; RV32I-NEXT:    sll t1, t1, t0
-; RV32I-NEXT:    or a0, a0, t1
-; RV32I-NEXT:    srl a4, a4, a6
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    sll a1, a1, t0
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    srl a4, a7, a6
-; RV32I-NEXT:    lw t1, 16(a3)
-; RV32I-NEXT:    lw t2, 20(a3)
-; RV32I-NEXT:    lw t3, 24(a3)
-; RV32I-NEXT:    lw t4, 28(a3)
-; RV32I-NEXT:    slli a3, t1, 1
-; RV32I-NEXT:    sll a3, a3, t0
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    srl a4, a5, a6
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s0, t6
+; RV32I-NEXT:    or s1, a0, s1
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, a0
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, s2, t3
+; RV32I-NEXT:    or t1, s4, s3
+; RV32I-NEXT:    or a3, t4, a3
+; RV32I-NEXT:    or t2, s1, t5
+; RV32I-NEXT:    or a0, a1, t6
+; RV32I-NEXT:    sw s0, 56(sp)
+; RV32I-NEXT:    sw s0, 60(sp)
+; RV32I-NEXT:    sw s0, 64(sp)
+; RV32I-NEXT:    sw s0, 68(sp)
+; RV32I-NEXT:    sw s0, 40(sp)
+; RV32I-NEXT:    sw s0, 44(sp)
+; RV32I-NEXT:    sw s0, 48(sp)
+; RV32I-NEXT:    sw s0, 52(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
+; RV32I-NEXT:    sw t2, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    srli a1, a0, 3
+; RV32I-NEXT:    andi a3, a0, 31
+; RV32I-NEXT:    andi a4, a1, 28
+; RV32I-NEXT:    xori a1, a3, 31
+; RV32I-NEXT:    add a4, s8, a4
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a6, 8(a4)
+; RV32I-NEXT:    lw a7, 12(a4)
+; RV32I-NEXT:    lw t0, 16(a4)
+; RV32I-NEXT:    lw t1, 20(a4)
+; RV32I-NEXT:    lw t2, 24(a4)
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    srl t3, a5, a0
+; RV32I-NEXT:    slli t4, a6, 1
+; RV32I-NEXT:    srl a3, a3, a0
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl t5, a7, a0
+; RV32I-NEXT:    slli t6, t0, 1
+; RV32I-NEXT:    srl a6, a6, a0
 ; RV32I-NEXT:    slli a7, a7, 1
-; RV32I-NEXT:    sll a5, a7, t0
-; RV32I-NEXT:    or a4, a4, a5
-; RV32I-NEXT:    srl a5, t2, a6
-; RV32I-NEXT:    slli a7, t3, 1
-; RV32I-NEXT:    sll a7, a7, t0
-; RV32I-NEXT:    or a5, a5, a7
-; RV32I-NEXT:    srl a7, t1, a6
-; RV32I-NEXT:    slli t2, t2, 1
-; RV32I-NEXT:    sll t1, t2, t0
-; RV32I-NEXT:    or a7, a7, t1
-; RV32I-NEXT:    srl t1, t3, a6
-; RV32I-NEXT:    slli t2, t4, 1
-; RV32I-NEXT:    sll t0, t2, t0
-; RV32I-NEXT:    or t0, t1, t0
-; RV32I-NEXT:    sra a6, t4, a6
-; RV32I-NEXT:    srli t1, a6, 24
-; RV32I-NEXT:    srli t2, a6, 16
-; RV32I-NEXT:    srli t3, a6, 8
-; RV32I-NEXT:    sb a6, 28(a2)
-; RV32I-NEXT:    sb t3, 29(a2)
-; RV32I-NEXT:    sb t2, 30(a2)
-; RV32I-NEXT:    sb t1, 31(a2)
-; RV32I-NEXT:    srli a6, t0, 24
-; RV32I-NEXT:    srli t1, t0, 16
-; RV32I-NEXT:    srli t2, t0, 8
-; RV32I-NEXT:    sb t0, 24(a2)
+; RV32I-NEXT:    srl s0, t1, a0
+; RV32I-NEXT:    slli s1, t2, 1
+; RV32I-NEXT:    srl t0, t0, a0
+; RV32I-NEXT:    slli t1, t1, 1
+; RV32I-NEXT:    srl t2, t2, a0
+; RV32I-NEXT:    slli s2, a4, 1
+; RV32I-NEXT:    sra s3, a4, a0
+; RV32I-NEXT:    sll a0, t4, a1
+; RV32I-NEXT:    sll a4, a5, a1
+; RV32I-NEXT:    sll a5, t6, a1
+; RV32I-NEXT:    sll a7, a7, a1
+; RV32I-NEXT:    sll t4, s1, a1
+; RV32I-NEXT:    sll t1, t1, a1
+; RV32I-NEXT:    sll t6, s2, a1
+; RV32I-NEXT:    srli s1, s3, 24
+; RV32I-NEXT:    srli s2, s3, 16
+; RV32I-NEXT:    srli s4, s3, 8
+; RV32I-NEXT:    or a0, t3, a0
+; RV32I-NEXT:    or a1, a3, a4
+; RV32I-NEXT:    or a3, t5, a5
+; RV32I-NEXT:    or a4, a6, a7
+; RV32I-NEXT:    or a5, s0, t4
+; RV32I-NEXT:    or a6, t0, t1
+; RV32I-NEXT:    or a7, t2, t6
+; RV32I-NEXT:    sb s3, 28(a2)
+; RV32I-NEXT:    sb s4, 29(a2)
+; RV32I-NEXT:    sb s2, 30(a2)
+; RV32I-NEXT:    sb s1, 31(a2)
+; RV32I-NEXT:    srli t0, a7, 24
+; RV32I-NEXT:    srli t1, a7, 16
+; RV32I-NEXT:    srli t2, a7, 8
+; RV32I-NEXT:    srli t3, a6, 24
+; RV32I-NEXT:    srli t4, a6, 16
+; RV32I-NEXT:    srli t5, a6, 8
+; RV32I-NEXT:    srli t6, a5, 24
+; RV32I-NEXT:    srli s0, a5, 16
+; RV32I-NEXT:    srli s1, a5, 8
+; RV32I-NEXT:    srli s2, a4, 24
+; RV32I-NEXT:    srli s3, a4, 16
+; RV32I-NEXT:    srli s4, a4, 8
+; RV32I-NEXT:    srli s5, a3, 24
+; RV32I-NEXT:    srli s6, a3, 16
+; RV32I-NEXT:    srli s7, a3, 8
+; RV32I-NEXT:    srli s8, a1, 24
+; RV32I-NEXT:    srli s9, a1, 16
+; RV32I-NEXT:    sb a7, 24(a2)
 ; RV32I-NEXT:    sb t2, 25(a2)
 ; RV32I-NEXT:    sb t1, 26(a2)
-; RV32I-NEXT:    sb a6, 27(a2)
-; RV32I-NEXT:    srli a6, a7, 24
-; RV32I-NEXT:    srli t0, a7, 16
-; RV32I-NEXT:    srli t1, a7, 8
-; RV32I-NEXT:    sb a7, 16(a2)
-; RV32I-NEXT:    sb t1, 17(a2)
-; RV32I-NEXT:    sb t0, 18(a2)
-; RV32I-NEXT:    sb a6, 19(a2)
-; RV32I-NEXT:    srli a6, a5, 24
-; RV32I-NEXT:    srli a7, a5, 16
-; RV32I-NEXT:    srli t0, a5, 8
+; RV32I-NEXT:    sb t0, 27(a2)
+; RV32I-NEXT:    srli a7, a1, 8
+; RV32I-NEXT:    sb a6, 16(a2)
+; RV32I-NEXT:    sb t5, 17(a2)
+; RV32I-NEXT:    sb t4, 18(a2)
+; RV32I-NEXT:    sb t3, 19(a2)
+; RV32I-NEXT:    srli a6, a0, 24
 ; RV32I-NEXT:    sb a5, 20(a2)
-; RV32I-NEXT:    sb t0, 21(a2)
-; RV32I-NEXT:    sb a7, 22(a2)
-; RV32I-NEXT:    sb a6, 23(a2)
-; RV32I-NEXT:    srli a5, a4, 24
-; RV32I-NEXT:    srli a6, a4, 16
-; RV32I-NEXT:    srli a7, a4, 8
+; RV32I-NEXT:    sb s1, 21(a2)
+; RV32I-NEXT:    sb s0, 22(a2)
+; RV32I-NEXT:    sb t6, 23(a2)
+; RV32I-NEXT:    srli a5, a0, 16
 ; RV32I-NEXT:    sb a4, 8(a2)
-; RV32I-NEXT:    sb a7, 9(a2)
-; RV32I-NEXT:    sb a6, 10(a2)
-; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 16
-; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    sb s4, 9(a2)
+; RV32I-NEXT:    sb s3, 10(a2)
+; RV32I-NEXT:    sb s2, 11(a2)
+; RV32I-NEXT:    srli a4, a0, 8
 ; RV32I-NEXT:    sb a3, 12(a2)
-; RV32I-NEXT:    sb a6, 13(a2)
-; RV32I-NEXT:    sb a5, 14(a2)
-; RV32I-NEXT:    sb a4, 15(a2)
-; RV32I-NEXT:    srli a3, a1, 24
-; RV32I-NEXT:    srli a4, a1, 16
-; RV32I-NEXT:    srli a5, a1, 8
+; RV32I-NEXT:    sb s7, 13(a2)
+; RV32I-NEXT:    sb s6, 14(a2)
+; RV32I-NEXT:    sb s5, 15(a2)
 ; RV32I-NEXT:    sb a1, 0(a2)
-; RV32I-NEXT:    sb a5, 1(a2)
-; RV32I-NEXT:    sb a4, 2(a2)
-; RV32I-NEXT:    sb a3, 3(a2)
-; RV32I-NEXT:    srli a1, a0, 24
-; RV32I-NEXT:    srli a3, a0, 16
-; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    sb a7, 1(a2)
+; RV32I-NEXT:    sb s9, 2(a2)
+; RV32I-NEXT:    sb s8, 3(a2)
 ; RV32I-NEXT:    sb a0, 4(a2)
 ; RV32I-NEXT:    sb a4, 5(a2)
-; RV32I-NEXT:    sb a3, 6(a2)
-; RV32I-NEXT:    sb a1, 7(a2)
-; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    sb a5, 6(a2)
+; RV32I-NEXT:    sb a6, 7(a2)
+; RV32I-NEXT:    lw ra, 124(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 120(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 116(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 112(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 108(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 104(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s5, 100(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s6, 96(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s7, 92(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s8, 88(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s9, 84(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s10, 80(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s11, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 128
 ; RV32I-NEXT:    ret
   %src = load i256, ptr %src.ptr, align 1
   %bitOff = load i256, ptr %bitOff.ptr, align 1

diff  --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 572b74cc2499f7..c0cbbb3ff9389e 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -247,13 +247,13 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    add a5, a1, a3
 ; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    xor a3, a1, a3
 ; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    not a3, a3
 ; RV32-NEXT:    add a5, a5, a0
-; RV32-NEXT:    xor a0, a1, a5
-; RV32-NEXT:    xor a1, a1, a3
-; RV32-NEXT:    not a1, a1
-; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    xor a1, a1, a5
+; RV32-NEXT:    and a1, a3, a1
+; RV32-NEXT:    slti a0, a1, 0
 ; RV32-NEXT:    sw a2, 0(a4)
 ; RV32-NEXT:    sw a5, 4(a4)
 ; RV32-NEXT:    ret
@@ -271,13 +271,13 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    add a5, a1, a3
 ; RV32ZBA-NEXT:    add a2, a0, a2
+; RV32ZBA-NEXT:    xor a3, a1, a3
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
+; RV32ZBA-NEXT:    not a3, a3
 ; RV32ZBA-NEXT:    add a5, a5, a0
-; RV32ZBA-NEXT:    xor a0, a1, a5
-; RV32ZBA-NEXT:    xor a1, a1, a3
-; RV32ZBA-NEXT:    not a1, a1
-; RV32ZBA-NEXT:    and a0, a1, a0
-; RV32ZBA-NEXT:    slti a0, a0, 0
+; RV32ZBA-NEXT:    xor a1, a1, a5
+; RV32ZBA-NEXT:    and a1, a3, a1
+; RV32ZBA-NEXT:    slti a0, a1, 0
 ; RV32ZBA-NEXT:    sw a2, 0(a4)
 ; RV32ZBA-NEXT:    sw a5, 4(a4)
 ; RV32ZBA-NEXT:    ret
@@ -295,13 +295,13 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a5, a1, a3
 ; RV32ZICOND-NEXT:    add a2, a0, a2
+; RV32ZICOND-NEXT:    xor a3, a1, a3
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
+; RV32ZICOND-NEXT:    not a3, a3
 ; RV32ZICOND-NEXT:    add a5, a5, a0
-; RV32ZICOND-NEXT:    xor a0, a1, a5
-; RV32ZICOND-NEXT:    xor a1, a1, a3
-; RV32ZICOND-NEXT:    not a1, a1
-; RV32ZICOND-NEXT:    and a0, a1, a0
-; RV32ZICOND-NEXT:    slti a0, a0, 0
+; RV32ZICOND-NEXT:    xor a1, a1, a5
+; RV32ZICOND-NEXT:    and a1, a3, a1
+; RV32ZICOND-NEXT:    slti a0, a1, 0
 ; RV32ZICOND-NEXT:    sw a2, 0(a4)
 ; RV32ZICOND-NEXT:    sw a5, 4(a4)
 ; RV32ZICOND-NEXT:    ret
@@ -326,14 +326,14 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
 ; RV32-LABEL: saddo2.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    addi a3, a0, 4
+; RV32-NEXT:    not a4, a1
 ; RV32-NEXT:    sltu a0, a3, a0
-; RV32-NEXT:    add a4, a1, a0
-; RV32-NEXT:    xor a0, a1, a4
-; RV32-NEXT:    not a1, a1
-; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    add a5, a1, a0
+; RV32-NEXT:    xor a1, a1, a5
+; RV32-NEXT:    and a1, a4, a1
+; RV32-NEXT:    slti a0, a1, 0
 ; RV32-NEXT:    sw a3, 0(a2)
-; RV32-NEXT:    sw a4, 4(a2)
+; RV32-NEXT:    sw a5, 4(a2)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: saddo2.i64:
@@ -346,14 +346,14 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
 ; RV32ZBA-LABEL: saddo2.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    addi a3, a0, 4
+; RV32ZBA-NEXT:    not a4, a1
 ; RV32ZBA-NEXT:    sltu a0, a3, a0
-; RV32ZBA-NEXT:    add a4, a1, a0
-; RV32ZBA-NEXT:    xor a0, a1, a4
-; RV32ZBA-NEXT:    not a1, a1
-; RV32ZBA-NEXT:    and a0, a1, a0
-; RV32ZBA-NEXT:    slti a0, a0, 0
+; RV32ZBA-NEXT:    add a5, a1, a0
+; RV32ZBA-NEXT:    xor a1, a1, a5
+; RV32ZBA-NEXT:    and a1, a4, a1
+; RV32ZBA-NEXT:    slti a0, a1, 0
 ; RV32ZBA-NEXT:    sw a3, 0(a2)
-; RV32ZBA-NEXT:    sw a4, 4(a2)
+; RV32ZBA-NEXT:    sw a5, 4(a2)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: saddo2.i64:
@@ -366,14 +366,14 @@ define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
 ; RV32ZICOND-LABEL: saddo2.i64:
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    addi a3, a0, 4
+; RV32ZICOND-NEXT:    not a4, a1
 ; RV32ZICOND-NEXT:    sltu a0, a3, a0
-; RV32ZICOND-NEXT:    add a4, a1, a0
-; RV32ZICOND-NEXT:    xor a0, a1, a4
-; RV32ZICOND-NEXT:    not a1, a1
-; RV32ZICOND-NEXT:    and a0, a1, a0
-; RV32ZICOND-NEXT:    slti a0, a0, 0
+; RV32ZICOND-NEXT:    add a5, a1, a0
+; RV32ZICOND-NEXT:    xor a1, a1, a5
+; RV32ZICOND-NEXT:    and a1, a4, a1
+; RV32ZICOND-NEXT:    slti a0, a1, 0
 ; RV32ZICOND-NEXT:    sw a3, 0(a2)
-; RV32ZICOND-NEXT:    sw a4, 4(a2)
+; RV32ZICOND-NEXT:    sw a5, 4(a2)
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: saddo2.i64:
@@ -862,15 +862,14 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    sltu a5, a0, a2
 ; RV32-NEXT:    sub a6, a1, a3
+; RV32-NEXT:    xor a3, a1, a3
+; RV32-NEXT:    sub a2, a0, a2
 ; RV32-NEXT:    sub a5, a6, a5
-; RV32-NEXT:    xor a6, a1, a5
-; RV32-NEXT:    xor a1, a1, a3
-; RV32-NEXT:    and a1, a1, a6
-; RV32-NEXT:    slti a1, a1, 0
-; RV32-NEXT:    sub a0, a0, a2
-; RV32-NEXT:    sw a0, 0(a4)
+; RV32-NEXT:    xor a1, a1, a5
+; RV32-NEXT:    and a1, a3, a1
+; RV32-NEXT:    slti a0, a1, 0
+; RV32-NEXT:    sw a2, 0(a4)
 ; RV32-NEXT:    sw a5, 4(a4)
-; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: ssubo.i64:
@@ -886,15 +885,14 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    sltu a5, a0, a2
 ; RV32ZBA-NEXT:    sub a6, a1, a3
+; RV32ZBA-NEXT:    xor a3, a1, a3
+; RV32ZBA-NEXT:    sub a2, a0, a2
 ; RV32ZBA-NEXT:    sub a5, a6, a5
-; RV32ZBA-NEXT:    xor a6, a1, a5
-; RV32ZBA-NEXT:    xor a1, a1, a3
-; RV32ZBA-NEXT:    and a1, a1, a6
-; RV32ZBA-NEXT:    slti a1, a1, 0
-; RV32ZBA-NEXT:    sub a0, a0, a2
-; RV32ZBA-NEXT:    sw a0, 0(a4)
+; RV32ZBA-NEXT:    xor a1, a1, a5
+; RV32ZBA-NEXT:    and a1, a3, a1
+; RV32ZBA-NEXT:    slti a0, a1, 0
+; RV32ZBA-NEXT:    sw a2, 0(a4)
 ; RV32ZBA-NEXT:    sw a5, 4(a4)
-; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: ssubo.i64:
@@ -910,15 +908,14 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    sltu a5, a0, a2
 ; RV32ZICOND-NEXT:    sub a6, a1, a3
+; RV32ZICOND-NEXT:    xor a3, a1, a3
+; RV32ZICOND-NEXT:    sub a2, a0, a2
 ; RV32ZICOND-NEXT:    sub a5, a6, a5
-; RV32ZICOND-NEXT:    xor a6, a1, a5
-; RV32ZICOND-NEXT:    xor a1, a1, a3
-; RV32ZICOND-NEXT:    and a1, a1, a6
-; RV32ZICOND-NEXT:    slti a1, a1, 0
-; RV32ZICOND-NEXT:    sub a0, a0, a2
-; RV32ZICOND-NEXT:    sw a0, 0(a4)
+; RV32ZICOND-NEXT:    xor a1, a1, a5
+; RV32ZICOND-NEXT:    and a1, a3, a1
+; RV32ZICOND-NEXT:    slti a0, a1, 0
+; RV32ZICOND-NEXT:    sw a2, 0(a4)
 ; RV32ZICOND-NEXT:    sw a5, 4(a4)
-; RV32ZICOND-NEXT:    mv a0, a1
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: ssubo.i64:
@@ -1152,12 +1149,12 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    sltu a5, a0, a2
 ; RV32ZICOND-NEXT:    sub a3, a1, a3
+; RV32ZICOND-NEXT:    sub a2, a0, a2
 ; RV32ZICOND-NEXT:    sub a3, a3, a5
+; RV32ZICOND-NEXT:    sltu a0, a0, a2
 ; RV32ZICOND-NEXT:    xor a5, a3, a1
 ; RV32ZICOND-NEXT:    sltu a1, a1, a3
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a5
-; RV32ZICOND-NEXT:    sub a2, a0, a2
-; RV32ZICOND-NEXT:    sltu a0, a0, a2
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, a5
 ; RV32ZICOND-NEXT:    or a0, a0, a1
 ; RV32ZICOND-NEXT:    sw a2, 0(a4)
@@ -1268,8 +1265,8 @@ define zeroext i1 @smulo2.i32(i32 signext %v1, ptr %res) {
 ; RV32ZBA-LABEL: smulo2.i32:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    li a2, 13
-; RV32ZBA-NEXT:    mulh a2, a0, a2
 ; RV32ZBA-NEXT:    sh1add a3, a0, a0
+; RV32ZBA-NEXT:    mulh a2, a0, a2
 ; RV32ZBA-NEXT:    sh2add a3, a3, a0
 ; RV32ZBA-NEXT:    srai a0, a3, 31
 ; RV32ZBA-NEXT:    xor a0, a2, a0
@@ -1324,54 +1321,53 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    .cfi_offset s1, -8
-; RV32-NEXT:    mulhu a5, a0, a2
-; RV32-NEXT:    mul a6, a1, a2
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    sltu a6, a5, a6
-; RV32-NEXT:    mulhu a7, a1, a2
-; RV32-NEXT:    add a6, a7, a6
-; RV32-NEXT:    mul a7, a0, a3
-; RV32-NEXT:    add a5, a7, a5
-; RV32-NEXT:    sltu a7, a5, a7
-; RV32-NEXT:    mulhu t0, a0, a3
-; RV32-NEXT:    add a7, t0, a7
-; RV32-NEXT:    add a7, a6, a7
-; RV32-NEXT:    mul t0, a1, a3
-; RV32-NEXT:    add t1, t0, a7
-; RV32-NEXT:    srai t2, a1, 31
-; RV32-NEXT:    mul t3, a2, t2
+; RV32-NEXT:    mulhu a6, a0, a2
+; RV32-NEXT:    mul a7, a1, a2
+; RV32-NEXT:    mulhu t0, a1, a2
+; RV32-NEXT:    mul t1, a0, a3
+; RV32-NEXT:    mulhu t2, a0, a3
+; RV32-NEXT:    mul a5, a1, a3
+; RV32-NEXT:    srai t3, a1, 31
 ; RV32-NEXT:    srai t4, a3, 31
-; RV32-NEXT:    mul t5, t4, a0
-; RV32-NEXT:    add t6, t5, t3
-; RV32-NEXT:    add s0, t1, t6
-; RV32-NEXT:    sltu s1, s0, t1
-; RV32-NEXT:    sltu t0, t1, t0
-; RV32-NEXT:    sltu a6, a7, a6
-; RV32-NEXT:    mulhu a7, a1, a3
+; RV32-NEXT:    mulhu t5, a1, a3
+; RV32-NEXT:    mul t6, a0, a2
 ; RV32-NEXT:    add a6, a7, a6
-; RV32-NEXT:    add a6, a6, t0
-; RV32-NEXT:    mulhu a7, a2, t2
-; RV32-NEXT:    add a7, a7, t3
-; RV32-NEXT:    mul a3, a3, t2
-; RV32-NEXT:    add a3, a7, a3
+; RV32-NEXT:    mul s0, a2, t3
+; RV32-NEXT:    mul s1, t4, a0
+; RV32-NEXT:    mulhu a2, a2, t3
+; RV32-NEXT:    mul a3, a3, t3
 ; RV32-NEXT:    mul a1, t4, a1
-; RV32-NEXT:    mulhu a7, t4, a0
+; RV32-NEXT:    mulhu a0, t4, a0
+; RV32-NEXT:    sltu a7, a6, a7
+; RV32-NEXT:    add a6, t1, a6
+; RV32-NEXT:    add t3, s1, s0
+; RV32-NEXT:    add a2, a2, s0
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a7, t0, a7
+; RV32-NEXT:    sltu a1, a6, t1
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, s1
+; RV32-NEXT:    sltu a3, t3, s1
+; RV32-NEXT:    srai t0, a6, 31
+; RV32-NEXT:    add a1, t2, a1
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    add a1, a7, a1
-; RV32-NEXT:    add a1, a1, t5
-; RV32-NEXT:    add a1, a1, a3
-; RV32-NEXT:    sltu a3, t6, t5
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a2, a5, a1
+; RV32-NEXT:    sltu a1, a1, a7
+; RV32-NEXT:    add t3, a2, t3
+; RV32-NEXT:    sltu a3, a2, a5
+; RV32-NEXT:    add a1, t5, a1
+; RV32-NEXT:    sltu a2, t3, a2
 ; RV32-NEXT:    add a1, a1, a3
-; RV32-NEXT:    add a1, a6, a1
-; RV32-NEXT:    add a1, a1, s1
-; RV32-NEXT:    srai a3, a5, 31
-; RV32-NEXT:    xor a1, a1, a3
-; RV32-NEXT:    xor a3, s0, a3
-; RV32-NEXT:    or a1, a3, a1
-; RV32-NEXT:    snez a1, a1
-; RV32-NEXT:    mul a0, a0, a2
-; RV32-NEXT:    sw a0, 0(a4)
-; RV32-NEXT:    sw a5, 4(a4)
-; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    xor a3, t3, t0
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    xor a0, a0, t0
+; RV32-NEXT:    or a0, a3, a0
+; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    sw t6, 0(a4)
+; RV32-NEXT:    sw a6, 4(a4)
 ; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore s0
@@ -1398,54 +1394,53 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
 ; RV32ZBA-NEXT:    .cfi_offset s1, -8
-; RV32ZBA-NEXT:    mulhu a5, a0, a2
-; RV32ZBA-NEXT:    mul a6, a1, a2
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    sltu a6, a5, a6
-; RV32ZBA-NEXT:    mulhu a7, a1, a2
-; RV32ZBA-NEXT:    add a6, a7, a6
-; RV32ZBA-NEXT:    mul a7, a0, a3
-; RV32ZBA-NEXT:    add a5, a7, a5
-; RV32ZBA-NEXT:    sltu a7, a5, a7
-; RV32ZBA-NEXT:    mulhu t0, a0, a3
-; RV32ZBA-NEXT:    add a7, t0, a7
-; RV32ZBA-NEXT:    add a7, a6, a7
-; RV32ZBA-NEXT:    mul t0, a1, a3
-; RV32ZBA-NEXT:    add t1, t0, a7
-; RV32ZBA-NEXT:    srai t2, a1, 31
-; RV32ZBA-NEXT:    mul t3, a2, t2
+; RV32ZBA-NEXT:    mulhu a6, a0, a2
+; RV32ZBA-NEXT:    mul a7, a1, a2
+; RV32ZBA-NEXT:    mulhu t0, a1, a2
+; RV32ZBA-NEXT:    mul t1, a0, a3
+; RV32ZBA-NEXT:    mulhu t2, a0, a3
+; RV32ZBA-NEXT:    mul a5, a1, a3
+; RV32ZBA-NEXT:    srai t3, a1, 31
 ; RV32ZBA-NEXT:    srai t4, a3, 31
-; RV32ZBA-NEXT:    mul t5, t4, a0
-; RV32ZBA-NEXT:    add t6, t5, t3
-; RV32ZBA-NEXT:    add s0, t1, t6
-; RV32ZBA-NEXT:    sltu s1, s0, t1
-; RV32ZBA-NEXT:    sltu t0, t1, t0
-; RV32ZBA-NEXT:    sltu a6, a7, a6
-; RV32ZBA-NEXT:    mulhu a7, a1, a3
+; RV32ZBA-NEXT:    mulhu t5, a1, a3
+; RV32ZBA-NEXT:    mul t6, a0, a2
 ; RV32ZBA-NEXT:    add a6, a7, a6
-; RV32ZBA-NEXT:    add a6, a6, t0
-; RV32ZBA-NEXT:    mulhu a7, a2, t2
-; RV32ZBA-NEXT:    add a7, a7, t3
-; RV32ZBA-NEXT:    mul a3, a3, t2
-; RV32ZBA-NEXT:    add a3, a7, a3
+; RV32ZBA-NEXT:    mul s0, a2, t3
+; RV32ZBA-NEXT:    mul s1, t4, a0
+; RV32ZBA-NEXT:    mulhu a2, a2, t3
+; RV32ZBA-NEXT:    mul a3, a3, t3
 ; RV32ZBA-NEXT:    mul a1, t4, a1
-; RV32ZBA-NEXT:    mulhu a7, t4, a0
+; RV32ZBA-NEXT:    mulhu a0, t4, a0
+; RV32ZBA-NEXT:    sltu a7, a6, a7
+; RV32ZBA-NEXT:    add a6, t1, a6
+; RV32ZBA-NEXT:    add t3, s1, s0
+; RV32ZBA-NEXT:    add a2, a2, s0
+; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    add a7, t0, a7
+; RV32ZBA-NEXT:    sltu a1, a6, t1
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    add a0, a0, s1
+; RV32ZBA-NEXT:    sltu a3, t3, s1
+; RV32ZBA-NEXT:    srai t0, a6, 31
+; RV32ZBA-NEXT:    add a1, t2, a1
+; RV32ZBA-NEXT:    add a0, a0, a2
 ; RV32ZBA-NEXT:    add a1, a7, a1
-; RV32ZBA-NEXT:    add a1, a1, t5
-; RV32ZBA-NEXT:    add a1, a1, a3
-; RV32ZBA-NEXT:    sltu a3, t6, t5
+; RV32ZBA-NEXT:    add a0, a0, a3
+; RV32ZBA-NEXT:    add a2, a5, a1
+; RV32ZBA-NEXT:    sltu a1, a1, a7
+; RV32ZBA-NEXT:    add t3, a2, t3
+; RV32ZBA-NEXT:    sltu a3, a2, a5
+; RV32ZBA-NEXT:    add a1, t5, a1
+; RV32ZBA-NEXT:    sltu a2, t3, a2
 ; RV32ZBA-NEXT:    add a1, a1, a3
-; RV32ZBA-NEXT:    add a1, a6, a1
-; RV32ZBA-NEXT:    add a1, a1, s1
-; RV32ZBA-NEXT:    srai a3, a5, 31
-; RV32ZBA-NEXT:    xor a1, a1, a3
-; RV32ZBA-NEXT:    xor a3, s0, a3
-; RV32ZBA-NEXT:    or a1, a3, a1
-; RV32ZBA-NEXT:    snez a1, a1
-; RV32ZBA-NEXT:    mul a0, a0, a2
-; RV32ZBA-NEXT:    sw a0, 0(a4)
-; RV32ZBA-NEXT:    sw a5, 4(a4)
-; RV32ZBA-NEXT:    mv a0, a1
+; RV32ZBA-NEXT:    xor a3, t3, t0
+; RV32ZBA-NEXT:    add a0, a1, a0
+; RV32ZBA-NEXT:    add a0, a0, a2
+; RV32ZBA-NEXT:    xor a0, a0, t0
+; RV32ZBA-NEXT:    or a0, a3, a0
+; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    sw t6, 0(a4)
+; RV32ZBA-NEXT:    sw a6, 4(a4)
 ; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZBA-NEXT:    .cfi_restore s0
@@ -1472,54 +1467,53 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
 ; RV32ZICOND-NEXT:    .cfi_offset s0, -4
 ; RV32ZICOND-NEXT:    .cfi_offset s1, -8
-; RV32ZICOND-NEXT:    mulhu a5, a0, a2
-; RV32ZICOND-NEXT:    mul a6, a1, a2
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    sltu a6, a5, a6
-; RV32ZICOND-NEXT:    mulhu a7, a1, a2
-; RV32ZICOND-NEXT:    add a6, a7, a6
-; RV32ZICOND-NEXT:    mul a7, a0, a3
-; RV32ZICOND-NEXT:    add a5, a7, a5
-; RV32ZICOND-NEXT:    sltu a7, a5, a7
-; RV32ZICOND-NEXT:    mulhu t0, a0, a3
-; RV32ZICOND-NEXT:    add a7, t0, a7
-; RV32ZICOND-NEXT:    add a7, a6, a7
-; RV32ZICOND-NEXT:    mul t0, a1, a3
-; RV32ZICOND-NEXT:    add t1, t0, a7
-; RV32ZICOND-NEXT:    srai t2, a1, 31
-; RV32ZICOND-NEXT:    mul t3, a2, t2
+; RV32ZICOND-NEXT:    mulhu a6, a0, a2
+; RV32ZICOND-NEXT:    mul a7, a1, a2
+; RV32ZICOND-NEXT:    mulhu t0, a1, a2
+; RV32ZICOND-NEXT:    mul t1, a0, a3
+; RV32ZICOND-NEXT:    mulhu t2, a0, a3
+; RV32ZICOND-NEXT:    mul a5, a1, a3
+; RV32ZICOND-NEXT:    srai t3, a1, 31
 ; RV32ZICOND-NEXT:    srai t4, a3, 31
-; RV32ZICOND-NEXT:    mul t5, t4, a0
-; RV32ZICOND-NEXT:    add t6, t5, t3
-; RV32ZICOND-NEXT:    add s0, t1, t6
-; RV32ZICOND-NEXT:    sltu s1, s0, t1
-; RV32ZICOND-NEXT:    sltu t0, t1, t0
-; RV32ZICOND-NEXT:    sltu a6, a7, a6
-; RV32ZICOND-NEXT:    mulhu a7, a1, a3
+; RV32ZICOND-NEXT:    mulhu t5, a1, a3
+; RV32ZICOND-NEXT:    mul t6, a0, a2
 ; RV32ZICOND-NEXT:    add a6, a7, a6
-; RV32ZICOND-NEXT:    add a6, a6, t0
-; RV32ZICOND-NEXT:    mulhu a7, a2, t2
-; RV32ZICOND-NEXT:    add a7, a7, t3
-; RV32ZICOND-NEXT:    mul a3, a3, t2
-; RV32ZICOND-NEXT:    add a3, a7, a3
+; RV32ZICOND-NEXT:    mul s0, a2, t3
+; RV32ZICOND-NEXT:    mul s1, t4, a0
+; RV32ZICOND-NEXT:    mulhu a2, a2, t3
+; RV32ZICOND-NEXT:    mul a3, a3, t3
 ; RV32ZICOND-NEXT:    mul a1, t4, a1
-; RV32ZICOND-NEXT:    mulhu a7, t4, a0
+; RV32ZICOND-NEXT:    mulhu a0, t4, a0
+; RV32ZICOND-NEXT:    sltu a7, a6, a7
+; RV32ZICOND-NEXT:    add a6, t1, a6
+; RV32ZICOND-NEXT:    add t3, s1, s0
+; RV32ZICOND-NEXT:    add a2, a2, s0
+; RV32ZICOND-NEXT:    add a0, a0, a1
+; RV32ZICOND-NEXT:    add a7, t0, a7
+; RV32ZICOND-NEXT:    sltu a1, a6, t1
+; RV32ZICOND-NEXT:    add a2, a2, a3
+; RV32ZICOND-NEXT:    add a0, a0, s1
+; RV32ZICOND-NEXT:    sltu a3, t3, s1
+; RV32ZICOND-NEXT:    srai t0, a6, 31
+; RV32ZICOND-NEXT:    add a1, t2, a1
+; RV32ZICOND-NEXT:    add a0, a0, a2
 ; RV32ZICOND-NEXT:    add a1, a7, a1
-; RV32ZICOND-NEXT:    add a1, a1, t5
+; RV32ZICOND-NEXT:    add a0, a0, a3
+; RV32ZICOND-NEXT:    add a2, a5, a1
+; RV32ZICOND-NEXT:    sltu a1, a1, a7
+; RV32ZICOND-NEXT:    add t3, a2, t3
+; RV32ZICOND-NEXT:    sltu a3, a2, a5
+; RV32ZICOND-NEXT:    add a1, t5, a1
+; RV32ZICOND-NEXT:    sltu a2, t3, a2
 ; RV32ZICOND-NEXT:    add a1, a1, a3
-; RV32ZICOND-NEXT:    sltu a3, t6, t5
-; RV32ZICOND-NEXT:    add a1, a1, a3
-; RV32ZICOND-NEXT:    add a1, a6, a1
-; RV32ZICOND-NEXT:    add a1, a1, s1
-; RV32ZICOND-NEXT:    srai a3, a5, 31
-; RV32ZICOND-NEXT:    xor a1, a1, a3
-; RV32ZICOND-NEXT:    xor a3, s0, a3
-; RV32ZICOND-NEXT:    or a1, a3, a1
-; RV32ZICOND-NEXT:    snez a1, a1
-; RV32ZICOND-NEXT:    mul a0, a0, a2
-; RV32ZICOND-NEXT:    sw a0, 0(a4)
-; RV32ZICOND-NEXT:    sw a5, 4(a4)
-; RV32ZICOND-NEXT:    mv a0, a1
+; RV32ZICOND-NEXT:    xor a3, t3, t0
+; RV32ZICOND-NEXT:    add a0, a1, a0
+; RV32ZICOND-NEXT:    add a0, a0, a2
+; RV32ZICOND-NEXT:    xor a0, a0, t0
+; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    sw t6, 0(a4)
+; RV32ZICOND-NEXT:    sw a6, 4(a4)
 ; RV32ZICOND-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZICOND-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
 ; RV32ZICOND-NEXT:    .cfi_restore s0
@@ -1549,27 +1543,26 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; RV32-LABEL: smulo2.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a3, 13
-; RV32-NEXT:    mulhu a4, a0, a3
-; RV32-NEXT:    mul a5, a1, a3
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    sltu a5, a4, a5
-; RV32-NEXT:    mulhu a6, a1, a3
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    srai a4, a1, 31
+; RV32-NEXT:    mulhu a5, a0, a3
 ; RV32-NEXT:    mul a6, a1, a3
-; RV32-NEXT:    add a6, a5, a6
-; RV32-NEXT:    srai a7, a4, 31
-; RV32-NEXT:    xor t0, a6, a7
-; RV32-NEXT:    sltu a5, a6, a5
-; RV32-NEXT:    mulh a1, a1, a3
-; RV32-NEXT:    add a1, a1, a5
-; RV32-NEXT:    xor a1, a1, a7
-; RV32-NEXT:    or a1, t0, a1
-; RV32-NEXT:    snez a1, a1
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    sw a0, 0(a2)
-; RV32-NEXT:    sw a4, 4(a2)
-; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    mulhu a1, a1, a3
+; RV32-NEXT:    mul a7, a4, a3
+; RV32-NEXT:    mulh a4, a4, a3
+; RV32-NEXT:    mul a3, a0, a3
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    sltu a0, a5, a6
+; RV32-NEXT:    srai a6, a5, 31
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add a7, a0, a7
+; RV32-NEXT:    xor a1, a7, a6
+; RV32-NEXT:    sltu a0, a7, a0
+; RV32-NEXT:    add a0, a4, a0
+; RV32-NEXT:    xor a0, a0, a6
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    sw a3, 0(a2)
+; RV32-NEXT:    sw a5, 4(a2)
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: smulo2.i64:
@@ -1586,37 +1579,36 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZBA-LABEL: smulo2.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    li a3, 13
-; RV32ZBA-NEXT:    mulhu a4, a0, a3
-; RV32ZBA-NEXT:    sh1add a5, a1, a1
-; RV32ZBA-NEXT:    sh2add a5, a5, a1
-; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    sltu a5, a4, a5
-; RV32ZBA-NEXT:    mulhu a6, a1, a3
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    srai a1, a1, 31
-; RV32ZBA-NEXT:    sh1add a6, a1, a1
-; RV32ZBA-NEXT:    sh2add a6, a6, a1
-; RV32ZBA-NEXT:    add a6, a5, a6
-; RV32ZBA-NEXT:    srai a7, a4, 31
-; RV32ZBA-NEXT:    xor t0, a6, a7
-; RV32ZBA-NEXT:    sltu a5, a6, a5
-; RV32ZBA-NEXT:    mulh a1, a1, a3
-; RV32ZBA-NEXT:    add a1, a1, a5
-; RV32ZBA-NEXT:    xor a1, a1, a7
-; RV32ZBA-NEXT:    or a1, t0, a1
-; RV32ZBA-NEXT:    snez a1, a1
-; RV32ZBA-NEXT:    sh1add a3, a0, a0
-; RV32ZBA-NEXT:    sh2add a0, a3, a0
-; RV32ZBA-NEXT:    sw a0, 0(a2)
-; RV32ZBA-NEXT:    sw a4, 4(a2)
-; RV32ZBA-NEXT:    mv a0, a1
+; RV32ZBA-NEXT:    sh1add a4, a1, a1
+; RV32ZBA-NEXT:    srai a5, a1, 31
+; RV32ZBA-NEXT:    sh1add a6, a0, a0
+; RV32ZBA-NEXT:    mulhu a7, a0, a3
+; RV32ZBA-NEXT:    sh2add a4, a4, a1
+; RV32ZBA-NEXT:    mulhu a1, a1, a3
+; RV32ZBA-NEXT:    sh1add t0, a5, a5
+; RV32ZBA-NEXT:    mulh a3, a5, a3
+; RV32ZBA-NEXT:    sh2add a6, a6, a0
+; RV32ZBA-NEXT:    add a7, a4, a7
+; RV32ZBA-NEXT:    sh2add a0, t0, a5
+; RV32ZBA-NEXT:    sltu a4, a7, a4
+; RV32ZBA-NEXT:    srai a5, a7, 31
+; RV32ZBA-NEXT:    add a1, a1, a4
+; RV32ZBA-NEXT:    add a0, a1, a0
+; RV32ZBA-NEXT:    xor a4, a0, a5
+; RV32ZBA-NEXT:    sltu a0, a0, a1
+; RV32ZBA-NEXT:    add a0, a3, a0
+; RV32ZBA-NEXT:    xor a0, a0, a5
+; RV32ZBA-NEXT:    or a0, a4, a0
+; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    sw a6, 0(a2)
+; RV32ZBA-NEXT:    sw a7, 4(a2)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo2.i64:
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    li a2, 13
-; RV64ZBA-NEXT:    mulh a2, a0, a2
 ; RV64ZBA-NEXT:    sh1add a3, a0, a0
+; RV64ZBA-NEXT:    mulh a2, a0, a2
 ; RV64ZBA-NEXT:    sh2add a3, a3, a0
 ; RV64ZBA-NEXT:    srai a0, a3, 63
 ; RV64ZBA-NEXT:    xor a0, a2, a0
@@ -1627,27 +1619,26 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZICOND-LABEL: smulo2.i64:
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    li a3, 13
-; RV32ZICOND-NEXT:    mulhu a4, a0, a3
-; RV32ZICOND-NEXT:    mul a5, a1, a3
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    sltu a5, a4, a5
-; RV32ZICOND-NEXT:    mulhu a6, a1, a3
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    srai a1, a1, 31
+; RV32ZICOND-NEXT:    srai a4, a1, 31
+; RV32ZICOND-NEXT:    mulhu a5, a0, a3
 ; RV32ZICOND-NEXT:    mul a6, a1, a3
-; RV32ZICOND-NEXT:    add a6, a5, a6
-; RV32ZICOND-NEXT:    srai a7, a4, 31
-; RV32ZICOND-NEXT:    xor t0, a6, a7
-; RV32ZICOND-NEXT:    sltu a5, a6, a5
-; RV32ZICOND-NEXT:    mulh a1, a1, a3
-; RV32ZICOND-NEXT:    add a1, a1, a5
-; RV32ZICOND-NEXT:    xor a1, a1, a7
-; RV32ZICOND-NEXT:    or a1, t0, a1
-; RV32ZICOND-NEXT:    snez a1, a1
-; RV32ZICOND-NEXT:    mul a0, a0, a3
-; RV32ZICOND-NEXT:    sw a0, 0(a2)
-; RV32ZICOND-NEXT:    sw a4, 4(a2)
-; RV32ZICOND-NEXT:    mv a0, a1
+; RV32ZICOND-NEXT:    mulhu a1, a1, a3
+; RV32ZICOND-NEXT:    mul a7, a4, a3
+; RV32ZICOND-NEXT:    mulh a4, a4, a3
+; RV32ZICOND-NEXT:    mul a3, a0, a3
+; RV32ZICOND-NEXT:    add a5, a6, a5
+; RV32ZICOND-NEXT:    sltu a0, a5, a6
+; RV32ZICOND-NEXT:    srai a6, a5, 31
+; RV32ZICOND-NEXT:    add a0, a1, a0
+; RV32ZICOND-NEXT:    add a7, a0, a7
+; RV32ZICOND-NEXT:    xor a1, a7, a6
+; RV32ZICOND-NEXT:    sltu a0, a7, a0
+; RV32ZICOND-NEXT:    add a0, a4, a0
+; RV32ZICOND-NEXT:    xor a0, a0, a6
+; RV32ZICOND-NEXT:    or a0, a1, a0
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    sw a3, 0(a2)
+; RV32ZICOND-NEXT:    sw a5, 4(a2)
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: smulo2.i64:
@@ -1758,9 +1749,9 @@ define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
 ; RV32ZBA-LABEL: umulo2.i32:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    li a2, 13
+; RV32ZBA-NEXT:    sh1add a3, a0, a0
 ; RV32ZBA-NEXT:    mulhu a2, a0, a2
 ; RV32ZBA-NEXT:    snez a2, a2
-; RV32ZBA-NEXT:    sh1add a3, a0, a0
 ; RV32ZBA-NEXT:    sh2add a0, a3, a0
 ; RV32ZBA-NEXT:    sw a0, 0(a1)
 ; RV32ZBA-NEXT:    mv a0, a2
@@ -1878,24 +1869,23 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    mul a5, a3, a0
 ; RV32-NEXT:    mul a6, a1, a2
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    mulhu a6, a0, a2
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    sltu a6, a5, a6
-; RV32-NEXT:    snez a7, a3
-; RV32-NEXT:    snez t0, a1
-; RV32-NEXT:    and a7, t0, a7
-; RV32-NEXT:    mulhu a1, a1, a2
-; RV32-NEXT:    snez a1, a1
-; RV32-NEXT:    or a1, a7, a1
+; RV32-NEXT:    mulhu a7, a0, a2
+; RV32-NEXT:    snez t0, a3
 ; RV32-NEXT:    mulhu a3, a3, a0
-; RV32-NEXT:    snez a3, a3
-; RV32-NEXT:    or a1, a1, a3
-; RV32-NEXT:    or a1, a1, a6
-; RV32-NEXT:    mul a0, a0, a2
-; RV32-NEXT:    sw a0, 0(a4)
+; RV32-NEXT:    mul t1, a0, a2
+; RV32-NEXT:    mulhu a0, a1, a2
+; RV32-NEXT:    snez a1, a1
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    and a1, a1, t0
+; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    snez a2, a3
+; RV32-NEXT:    add a5, a7, a5
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    sltu a1, a5, a7
+; RV32-NEXT:    or a0, a0, a2
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    sw t1, 0(a4)
 ; RV32-NEXT:    sw a5, 4(a4)
-; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo.i64:
@@ -1911,24 +1901,23 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    mul a5, a3, a0
 ; RV32ZBA-NEXT:    mul a6, a1, a2
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    mulhu a6, a0, a2
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    sltu a6, a5, a6
-; RV32ZBA-NEXT:    snez a7, a3
-; RV32ZBA-NEXT:    snez t0, a1
-; RV32ZBA-NEXT:    and a7, t0, a7
-; RV32ZBA-NEXT:    mulhu a1, a1, a2
-; RV32ZBA-NEXT:    snez a1, a1
-; RV32ZBA-NEXT:    or a1, a7, a1
+; RV32ZBA-NEXT:    mulhu a7, a0, a2
+; RV32ZBA-NEXT:    snez t0, a3
 ; RV32ZBA-NEXT:    mulhu a3, a3, a0
-; RV32ZBA-NEXT:    snez a3, a3
-; RV32ZBA-NEXT:    or a1, a1, a3
-; RV32ZBA-NEXT:    or a1, a1, a6
-; RV32ZBA-NEXT:    mul a0, a0, a2
-; RV32ZBA-NEXT:    sw a0, 0(a4)
+; RV32ZBA-NEXT:    mul t1, a0, a2
+; RV32ZBA-NEXT:    mulhu a0, a1, a2
+; RV32ZBA-NEXT:    snez a1, a1
+; RV32ZBA-NEXT:    add a5, a6, a5
+; RV32ZBA-NEXT:    and a1, a1, t0
+; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    snez a2, a3
+; RV32ZBA-NEXT:    add a5, a7, a5
+; RV32ZBA-NEXT:    or a0, a1, a0
+; RV32ZBA-NEXT:    sltu a1, a5, a7
+; RV32ZBA-NEXT:    or a0, a0, a2
+; RV32ZBA-NEXT:    or a0, a0, a1
+; RV32ZBA-NEXT:    sw t1, 0(a4)
 ; RV32ZBA-NEXT:    sw a5, 4(a4)
-; RV32ZBA-NEXT:    mv a0, a1
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo.i64:
@@ -1944,24 +1933,23 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    mul a5, a3, a0
 ; RV32ZICOND-NEXT:    mul a6, a1, a2
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    mulhu a6, a0, a2
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    sltu a6, a5, a6
-; RV32ZICOND-NEXT:    snez a7, a3
-; RV32ZICOND-NEXT:    snez t0, a1
-; RV32ZICOND-NEXT:    and a7, t0, a7
-; RV32ZICOND-NEXT:    mulhu a1, a1, a2
-; RV32ZICOND-NEXT:    snez a1, a1
-; RV32ZICOND-NEXT:    or a1, a7, a1
+; RV32ZICOND-NEXT:    mulhu a7, a0, a2
+; RV32ZICOND-NEXT:    snez t0, a3
 ; RV32ZICOND-NEXT:    mulhu a3, a3, a0
-; RV32ZICOND-NEXT:    snez a3, a3
-; RV32ZICOND-NEXT:    or a1, a1, a3
-; RV32ZICOND-NEXT:    or a1, a1, a6
-; RV32ZICOND-NEXT:    mul a0, a0, a2
-; RV32ZICOND-NEXT:    sw a0, 0(a4)
+; RV32ZICOND-NEXT:    mul t1, a0, a2
+; RV32ZICOND-NEXT:    mulhu a0, a1, a2
+; RV32ZICOND-NEXT:    snez a1, a1
+; RV32ZICOND-NEXT:    add a5, a6, a5
+; RV32ZICOND-NEXT:    and a1, a1, t0
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    snez a2, a3
+; RV32ZICOND-NEXT:    add a5, a7, a5
+; RV32ZICOND-NEXT:    or a0, a1, a0
+; RV32ZICOND-NEXT:    sltu a1, a5, a7
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    sw t1, 0(a4)
 ; RV32ZICOND-NEXT:    sw a5, 4(a4)
-; RV32ZICOND-NEXT:    mv a0, a1
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo.i64:
@@ -1986,15 +1974,14 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32-NEXT:    li a3, 13
 ; RV32-NEXT:    mul a4, a1, a3
 ; RV32-NEXT:    mulhu a5, a0, a3
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    sltu a5, a4, a5
 ; RV32-NEXT:    mulhu a1, a1, a3
-; RV32-NEXT:    snez a1, a1
-; RV32-NEXT:    or a1, a1, a5
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    sw a0, 0(a2)
+; RV32-NEXT:    mul a3, a0, a3
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    snez a0, a1
+; RV32-NEXT:    sltu a1, a4, a5
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    sw a3, 0(a2)
 ; RV32-NEXT:    sw a4, 4(a2)
-; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: umulo2.i64:
@@ -2010,27 +1997,26 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZBA-LABEL: umulo2.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    li a3, 13
-; RV32ZBA-NEXT:    mulhu a4, a0, a3
-; RV32ZBA-NEXT:    sh1add a5, a1, a1
-; RV32ZBA-NEXT:    sh2add a5, a5, a1
-; RV32ZBA-NEXT:    add a5, a4, a5
-; RV32ZBA-NEXT:    sltu a4, a5, a4
+; RV32ZBA-NEXT:    sh1add a4, a1, a1
+; RV32ZBA-NEXT:    sh1add a5, a0, a0
+; RV32ZBA-NEXT:    sh2add a4, a4, a1
 ; RV32ZBA-NEXT:    mulhu a1, a1, a3
-; RV32ZBA-NEXT:    snez a1, a1
-; RV32ZBA-NEXT:    or a1, a1, a4
-; RV32ZBA-NEXT:    sh1add a3, a0, a0
-; RV32ZBA-NEXT:    sh2add a0, a3, a0
-; RV32ZBA-NEXT:    sw a0, 0(a2)
-; RV32ZBA-NEXT:    sw a5, 4(a2)
-; RV32ZBA-NEXT:    mv a0, a1
+; RV32ZBA-NEXT:    mulhu a3, a0, a3
+; RV32ZBA-NEXT:    sh2add a5, a5, a0
+; RV32ZBA-NEXT:    add a4, a3, a4
+; RV32ZBA-NEXT:    snez a0, a1
+; RV32ZBA-NEXT:    sltu a1, a4, a3
+; RV32ZBA-NEXT:    or a0, a0, a1
+; RV32ZBA-NEXT:    sw a5, 0(a2)
+; RV32ZBA-NEXT:    sw a4, 4(a2)
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: umulo2.i64:
 ; RV64ZBA:       # %bb.0: # %entry
 ; RV64ZBA-NEXT:    li a2, 13
+; RV64ZBA-NEXT:    sh1add a3, a0, a0
 ; RV64ZBA-NEXT:    mulhu a2, a0, a2
 ; RV64ZBA-NEXT:    snez a2, a2
-; RV64ZBA-NEXT:    sh1add a3, a0, a0
 ; RV64ZBA-NEXT:    sh2add a0, a3, a0
 ; RV64ZBA-NEXT:    sd a0, 0(a1)
 ; RV64ZBA-NEXT:    mv a0, a2
@@ -2041,15 +2027,14 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
 ; RV32ZICOND-NEXT:    li a3, 13
 ; RV32ZICOND-NEXT:    mul a4, a1, a3
 ; RV32ZICOND-NEXT:    mulhu a5, a0, a3
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    sltu a5, a4, a5
 ; RV32ZICOND-NEXT:    mulhu a1, a1, a3
-; RV32ZICOND-NEXT:    snez a1, a1
-; RV32ZICOND-NEXT:    or a1, a1, a5
-; RV32ZICOND-NEXT:    mul a0, a0, a3
-; RV32ZICOND-NEXT:    sw a0, 0(a2)
+; RV32ZICOND-NEXT:    mul a3, a0, a3
+; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    snez a0, a1
+; RV32ZICOND-NEXT:    sltu a1, a4, a5
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    sw a3, 0(a2)
 ; RV32ZICOND-NEXT:    sw a4, 4(a2)
-; RV32ZICOND-NEXT:    mv a0, a1
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo2.i64:
@@ -2119,8 +2104,8 @@ define i32 @saddo.select.i32(i32 signext %v1, i32 signext %v2) {
 ; RV32ZICOND-LABEL: saddo.select.i32:
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a2, a0, a1
-; RV32ZICOND-NEXT:    slt a2, a2, a0
 ; RV32ZICOND-NEXT:    slti a3, a1, 0
+; RV32ZICOND-NEXT:    slt a2, a2, a0
 ; RV32ZICOND-NEXT:    xor a2, a3, a2
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
@@ -2208,8 +2193,8 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    add a5, a0, a2
 ; RV32-NEXT:    sltu a5, a5, a0
 ; RV32-NEXT:    add a4, a4, a5
-; RV32-NEXT:    xor a4, a1, a4
 ; RV32-NEXT:    xor a5, a1, a3
+; RV32-NEXT:    xor a4, a1, a4
 ; RV32-NEXT:    not a5, a5
 ; RV32-NEXT:    and a4, a5, a4
 ; RV32-NEXT:    bltz a4, .LBB30_2
@@ -2236,8 +2221,8 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    add a5, a0, a2
 ; RV32ZBA-NEXT:    sltu a5, a5, a0
 ; RV32ZBA-NEXT:    add a4, a4, a5
-; RV32ZBA-NEXT:    xor a4, a1, a4
 ; RV32ZBA-NEXT:    xor a5, a1, a3
+; RV32ZBA-NEXT:    xor a4, a1, a4
 ; RV32ZBA-NEXT:    not a5, a5
 ; RV32ZBA-NEXT:    and a4, a5, a4
 ; RV32ZBA-NEXT:    bltz a4, .LBB30_2
@@ -2264,24 +2249,24 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    add a5, a0, a2
 ; RV32ZICOND-NEXT:    sltu a5, a5, a0
 ; RV32ZICOND-NEXT:    add a4, a4, a5
-; RV32ZICOND-NEXT:    xor a4, a1, a4
 ; RV32ZICOND-NEXT:    xor a5, a1, a3
 ; RV32ZICOND-NEXT:    not a5, a5
+; RV32ZICOND-NEXT:    xor a4, a1, a4
 ; RV32ZICOND-NEXT:    and a4, a5, a4
 ; RV32ZICOND-NEXT:    slti a4, a4, 0
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.nez a2, a3, a4
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: saddo.select.i64:
 ; RV64ZICOND:       # %bb.0: # %entry
 ; RV64ZICOND-NEXT:    add a2, a0, a1
-; RV64ZICOND-NEXT:    slt a2, a2, a0
 ; RV64ZICOND-NEXT:    slti a3, a1, 0
+; RV64ZICOND-NEXT:    slt a2, a2, a0
 ; RV64ZICOND-NEXT:    xor a2, a3, a2
 ; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
 ; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
@@ -2299,11 +2284,11 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    add a4, a1, a3
 ; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    xor a3, a1, a3
 ; RV32-NEXT:    sltu a0, a2, a0
 ; RV32-NEXT:    add a0, a4, a0
 ; RV32-NEXT:    xor a0, a1, a0
-; RV32-NEXT:    xor a1, a1, a3
-; RV32-NEXT:    not a1, a1
+; RV32-NEXT:    not a1, a3
 ; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    slti a0, a0, 0
 ; RV32-NEXT:    xori a0, a0, 1
@@ -2322,11 +2307,11 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    add a4, a1, a3
 ; RV32ZBA-NEXT:    add a2, a0, a2
+; RV32ZBA-NEXT:    xor a3, a1, a3
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
 ; RV32ZBA-NEXT:    add a0, a4, a0
 ; RV32ZBA-NEXT:    xor a0, a1, a0
-; RV32ZBA-NEXT:    xor a1, a1, a3
-; RV32ZBA-NEXT:    not a1, a1
+; RV32ZBA-NEXT:    not a1, a3
 ; RV32ZBA-NEXT:    and a0, a1, a0
 ; RV32ZBA-NEXT:    slti a0, a0, 0
 ; RV32ZBA-NEXT:    xori a0, a0, 1
@@ -2345,11 +2330,11 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a4, a1, a3
 ; RV32ZICOND-NEXT:    add a2, a0, a2
+; RV32ZICOND-NEXT:    xor a3, a1, a3
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
 ; RV32ZICOND-NEXT:    add a0, a4, a0
 ; RV32ZICOND-NEXT:    xor a0, a1, a0
-; RV32ZICOND-NEXT:    xor a1, a1, a3
-; RV32ZICOND-NEXT:    not a1, a1
+; RV32ZICOND-NEXT:    not a1, a3
 ; RV32ZICOND-NEXT:    and a0, a1, a0
 ; RV32ZICOND-NEXT:    slti a0, a0, 0
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
@@ -2550,10 +2535,10 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    or a4, a5, a4
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.nez a2, a3, a4
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: uaddo.select.i64:
@@ -2825,16 +2810,16 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    sltu a4, a0, a2
 ; RV32ZICOND-NEXT:    sub a5, a1, a3
 ; RV32ZICOND-NEXT:    sub a5, a5, a4
-; RV32ZICOND-NEXT:    xor a5, a1, a5
 ; RV32ZICOND-NEXT:    xor a4, a1, a3
+; RV32ZICOND-NEXT:    xor a5, a1, a5
 ; RV32ZICOND-NEXT:    and a4, a4, a5
 ; RV32ZICOND-NEXT:    slti a4, a4, 0
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.nez a2, a3, a4
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: ssubo.select.i64:
@@ -3097,20 +3082,20 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    sltu a4, a0, a2
 ; RV32ZICOND-NEXT:    sub a5, a1, a3
+; RV32ZICOND-NEXT:    sub a6, a0, a2
 ; RV32ZICOND-NEXT:    sub a5, a5, a4
-; RV32ZICOND-NEXT:    xor a4, a5, a1
+; RV32ZICOND-NEXT:    sltu a4, a0, a6
+; RV32ZICOND-NEXT:    xor a6, a5, a1
 ; RV32ZICOND-NEXT:    sltu a5, a1, a5
-; RV32ZICOND-NEXT:    czero.eqz a5, a5, a4
-; RV32ZICOND-NEXT:    sub a6, a0, a2
-; RV32ZICOND-NEXT:    sltu a6, a0, a6
-; RV32ZICOND-NEXT:    czero.nez a4, a6, a4
+; RV32ZICOND-NEXT:    czero.eqz a5, a5, a6
+; RV32ZICOND-NEXT:    czero.nez a4, a4, a6
 ; RV32ZICOND-NEXT:    or a4, a4, a5
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.nez a2, a3, a4
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: usubo.select.i64:
@@ -3179,13 +3164,13 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    sltu a4, a0, a2
 ; RV32ZICOND-NEXT:    sub a3, a1, a3
-; RV32ZICOND-NEXT:    sub a3, a3, a4
-; RV32ZICOND-NEXT:    xor a4, a3, a1
-; RV32ZICOND-NEXT:    sltu a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
 ; RV32ZICOND-NEXT:    sub a2, a0, a2
+; RV32ZICOND-NEXT:    sub a3, a3, a4
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
-; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
+; RV32ZICOND-NEXT:    xor a2, a3, a1
+; RV32ZICOND-NEXT:    sltu a1, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a2
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
 ; RV32ZICOND-NEXT:    or a0, a0, a1
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
 ; RV32ZICOND-NEXT:    ret
@@ -3340,46 +3325,46 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
+; RV32-NEXT:    mulhu a6, a1, a2
+; RV32-NEXT:    mul a7, a0, a3
+; RV32-NEXT:    mulhu t0, a0, a3
+; RV32-NEXT:    mul t1, a1, a3
+; RV32-NEXT:    srai t2, a1, 31
+; RV32-NEXT:    srai t3, a3, 31
+; RV32-NEXT:    mulhu t4, a1, a3
 ; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    mul t5, a2, t2
+; RV32-NEXT:    mul t6, t3, a0
+; RV32-NEXT:    mul s0, t3, a1
+; RV32-NEXT:    mulhu t3, t3, a0
+; RV32-NEXT:    add t3, t3, s0
+; RV32-NEXT:    mulhu s0, a2, t2
+; RV32-NEXT:    mul t2, a3, t2
 ; RV32-NEXT:    sltu a5, a4, a5
-; RV32-NEXT:    mulhu a6, a1, a2
+; RV32-NEXT:    add a4, a7, a4
+; RV32-NEXT:    add s0, s0, t5
+; RV32-NEXT:    add t5, t6, t5
 ; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    mul a6, a0, a3
-; RV32-NEXT:    add a4, a6, a4
-; RV32-NEXT:    sltu a6, a4, a6
-; RV32-NEXT:    mulhu a7, a0, a3
-; RV32-NEXT:    add a6, a7, a6
+; RV32-NEXT:    sltu a6, a4, a7
+; RV32-NEXT:    add t2, s0, t2
+; RV32-NEXT:    add t3, t3, t6
+; RV32-NEXT:    sltu a7, t5, t6
+; RV32-NEXT:    srai a4, a4, 31
+; RV32-NEXT:    add a6, t0, a6
+; RV32-NEXT:    add t2, t3, t2
 ; RV32-NEXT:    add a6, a5, a6
-; RV32-NEXT:    mul a7, a1, a3
-; RV32-NEXT:    add t0, a7, a6
-; RV32-NEXT:    srai t1, a1, 31
-; RV32-NEXT:    mul t2, a2, t1
-; RV32-NEXT:    srai t3, a3, 31
-; RV32-NEXT:    mul t4, t3, a0
-; RV32-NEXT:    add t5, t4, t2
-; RV32-NEXT:    add t6, t0, t5
-; RV32-NEXT:    sltu s0, t6, t0
-; RV32-NEXT:    sltu a7, t0, a7
+; RV32-NEXT:    add a7, t2, a7
+; RV32-NEXT:    add t0, t1, a6
 ; RV32-NEXT:    sltu a5, a6, a5
-; RV32-NEXT:    mulhu a6, a1, a3
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    add a5, a5, a7
-; RV32-NEXT:    mulhu a6, a2, t1
-; RV32-NEXT:    add a6, a6, t2
-; RV32-NEXT:    mul a7, a3, t1
-; RV32-NEXT:    add a6, a6, a7
-; RV32-NEXT:    mul a7, t3, a1
-; RV32-NEXT:    mulhu t0, t3, a0
-; RV32-NEXT:    add a7, t0, a7
-; RV32-NEXT:    add a7, a7, t4
-; RV32-NEXT:    add a6, a7, a6
-; RV32-NEXT:    sltu a7, t5, t4
-; RV32-NEXT:    add a6, a6, a7
+; RV32-NEXT:    add t5, t0, t5
+; RV32-NEXT:    sltu a6, t0, t1
+; RV32-NEXT:    add a5, t4, a5
+; RV32-NEXT:    sltu t0, t5, t0
 ; RV32-NEXT:    add a5, a5, a6
-; RV32-NEXT:    add a5, a5, s0
-; RV32-NEXT:    srai a4, a4, 31
+; RV32-NEXT:    add a5, a5, a7
+; RV32-NEXT:    add a5, a5, t0
 ; RV32-NEXT:    xor a5, a5, a4
-; RV32-NEXT:    xor a4, t6, a4
+; RV32-NEXT:    xor a4, t5, a4
 ; RV32-NEXT:    or a4, a4, a5
 ; RV32-NEXT:    bnez a4, .LBB46_2
 ; RV32-NEXT:  # %bb.1: # %entry
@@ -3411,46 +3396,46 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA-NEXT:    .cfi_offset s0, -4
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
+; RV32ZBA-NEXT:    mulhu a6, a1, a2
+; RV32ZBA-NEXT:    mul a7, a0, a3
+; RV32ZBA-NEXT:    mulhu t0, a0, a3
+; RV32ZBA-NEXT:    mul t1, a1, a3
+; RV32ZBA-NEXT:    srai t2, a1, 31
+; RV32ZBA-NEXT:    srai t3, a3, 31
+; RV32ZBA-NEXT:    mulhu t4, a1, a3
 ; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    mul t5, a2, t2
+; RV32ZBA-NEXT:    mul t6, t3, a0
+; RV32ZBA-NEXT:    mul s0, t3, a1
+; RV32ZBA-NEXT:    mulhu t3, t3, a0
+; RV32ZBA-NEXT:    add t3, t3, s0
+; RV32ZBA-NEXT:    mulhu s0, a2, t2
+; RV32ZBA-NEXT:    mul t2, a3, t2
 ; RV32ZBA-NEXT:    sltu a5, a4, a5
-; RV32ZBA-NEXT:    mulhu a6, a1, a2
+; RV32ZBA-NEXT:    add a4, a7, a4
+; RV32ZBA-NEXT:    add s0, s0, t5
+; RV32ZBA-NEXT:    add t5, t6, t5
 ; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    mul a6, a0, a3
-; RV32ZBA-NEXT:    add a4, a6, a4
-; RV32ZBA-NEXT:    sltu a6, a4, a6
-; RV32ZBA-NEXT:    mulhu a7, a0, a3
-; RV32ZBA-NEXT:    add a6, a7, a6
+; RV32ZBA-NEXT:    sltu a6, a4, a7
+; RV32ZBA-NEXT:    add t2, s0, t2
+; RV32ZBA-NEXT:    add t3, t3, t6
+; RV32ZBA-NEXT:    sltu a7, t5, t6
+; RV32ZBA-NEXT:    srai a4, a4, 31
+; RV32ZBA-NEXT:    add a6, t0, a6
+; RV32ZBA-NEXT:    add t2, t3, t2
 ; RV32ZBA-NEXT:    add a6, a5, a6
-; RV32ZBA-NEXT:    mul a7, a1, a3
-; RV32ZBA-NEXT:    add t0, a7, a6
-; RV32ZBA-NEXT:    srai t1, a1, 31
-; RV32ZBA-NEXT:    mul t2, a2, t1
-; RV32ZBA-NEXT:    srai t3, a3, 31
-; RV32ZBA-NEXT:    mul t4, t3, a0
-; RV32ZBA-NEXT:    add t5, t4, t2
-; RV32ZBA-NEXT:    add t6, t0, t5
-; RV32ZBA-NEXT:    sltu s0, t6, t0
-; RV32ZBA-NEXT:    sltu a7, t0, a7
+; RV32ZBA-NEXT:    add a7, t2, a7
+; RV32ZBA-NEXT:    add t0, t1, a6
 ; RV32ZBA-NEXT:    sltu a5, a6, a5
-; RV32ZBA-NEXT:    mulhu a6, a1, a3
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    add a5, a5, a7
-; RV32ZBA-NEXT:    mulhu a6, a2, t1
-; RV32ZBA-NEXT:    add a6, a6, t2
-; RV32ZBA-NEXT:    mul a7, a3, t1
-; RV32ZBA-NEXT:    add a6, a6, a7
-; RV32ZBA-NEXT:    mul a7, t3, a1
-; RV32ZBA-NEXT:    mulhu t0, t3, a0
-; RV32ZBA-NEXT:    add a7, t0, a7
-; RV32ZBA-NEXT:    add a7, a7, t4
-; RV32ZBA-NEXT:    add a6, a7, a6
-; RV32ZBA-NEXT:    sltu a7, t5, t4
-; RV32ZBA-NEXT:    add a6, a6, a7
+; RV32ZBA-NEXT:    add t5, t0, t5
+; RV32ZBA-NEXT:    sltu a6, t0, t1
+; RV32ZBA-NEXT:    add a5, t4, a5
+; RV32ZBA-NEXT:    sltu t0, t5, t0
 ; RV32ZBA-NEXT:    add a5, a5, a6
-; RV32ZBA-NEXT:    add a5, a5, s0
-; RV32ZBA-NEXT:    srai a4, a4, 31
+; RV32ZBA-NEXT:    add a5, a5, a7
+; RV32ZBA-NEXT:    add a5, a5, t0
 ; RV32ZBA-NEXT:    xor a5, a5, a4
-; RV32ZBA-NEXT:    xor a4, t6, a4
+; RV32ZBA-NEXT:    xor a4, t5, a4
 ; RV32ZBA-NEXT:    or a4, a4, a5
 ; RV32ZBA-NEXT:    bnez a4, .LBB46_2
 ; RV32ZBA-NEXT:  # %bb.1: # %entry
@@ -3482,53 +3467,53 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND-NEXT:    .cfi_offset s0, -4
 ; RV32ZICOND-NEXT:    mulhu a4, a0, a2
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
+; RV32ZICOND-NEXT:    mulhu a6, a1, a2
+; RV32ZICOND-NEXT:    mul a7, a0, a3
+; RV32ZICOND-NEXT:    mulhu t0, a0, a3
+; RV32ZICOND-NEXT:    mul t1, a1, a3
+; RV32ZICOND-NEXT:    srai t2, a1, 31
+; RV32ZICOND-NEXT:    srai t3, a3, 31
+; RV32ZICOND-NEXT:    mulhu t4, a1, a3
 ; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    mul t5, a2, t2
+; RV32ZICOND-NEXT:    mul t6, t3, a0
+; RV32ZICOND-NEXT:    mul s0, t3, a1
+; RV32ZICOND-NEXT:    mulhu t3, t3, a0
+; RV32ZICOND-NEXT:    add t3, t3, s0
+; RV32ZICOND-NEXT:    mulhu s0, a2, t2
+; RV32ZICOND-NEXT:    mul t2, a3, t2
 ; RV32ZICOND-NEXT:    sltu a5, a4, a5
-; RV32ZICOND-NEXT:    mulhu a6, a1, a2
+; RV32ZICOND-NEXT:    add a4, a7, a4
+; RV32ZICOND-NEXT:    add s0, s0, t5
+; RV32ZICOND-NEXT:    add t5, t6, t5
 ; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    mul a6, a0, a3
-; RV32ZICOND-NEXT:    add a4, a6, a4
-; RV32ZICOND-NEXT:    sltu a6, a4, a6
-; RV32ZICOND-NEXT:    mulhu a7, a0, a3
-; RV32ZICOND-NEXT:    add a6, a7, a6
+; RV32ZICOND-NEXT:    sltu a6, a4, a7
+; RV32ZICOND-NEXT:    add t2, s0, t2
+; RV32ZICOND-NEXT:    add t3, t3, t6
+; RV32ZICOND-NEXT:    sltu a7, t5, t6
+; RV32ZICOND-NEXT:    srai a4, a4, 31
+; RV32ZICOND-NEXT:    add a6, t0, a6
+; RV32ZICOND-NEXT:    add t2, t3, t2
 ; RV32ZICOND-NEXT:    add a6, a5, a6
-; RV32ZICOND-NEXT:    mul a7, a1, a3
-; RV32ZICOND-NEXT:    add t0, a7, a6
-; RV32ZICOND-NEXT:    srai t1, a1, 31
-; RV32ZICOND-NEXT:    mul t2, a2, t1
-; RV32ZICOND-NEXT:    srai t3, a3, 31
-; RV32ZICOND-NEXT:    mul t4, t3, a0
-; RV32ZICOND-NEXT:    add t5, t4, t2
-; RV32ZICOND-NEXT:    add t6, t0, t5
-; RV32ZICOND-NEXT:    sltu s0, t6, t0
-; RV32ZICOND-NEXT:    sltu a7, t0, a7
+; RV32ZICOND-NEXT:    add a7, t2, a7
+; RV32ZICOND-NEXT:    add t0, t1, a6
 ; RV32ZICOND-NEXT:    sltu a5, a6, a5
-; RV32ZICOND-NEXT:    mulhu a6, a1, a3
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    add a5, a5, a7
-; RV32ZICOND-NEXT:    mulhu a6, a2, t1
-; RV32ZICOND-NEXT:    add a6, a6, t2
-; RV32ZICOND-NEXT:    mul a7, a3, t1
-; RV32ZICOND-NEXT:    add a6, a6, a7
-; RV32ZICOND-NEXT:    mul a7, t3, a1
-; RV32ZICOND-NEXT:    mulhu t0, t3, a0
-; RV32ZICOND-NEXT:    add a7, t0, a7
-; RV32ZICOND-NEXT:    add a7, a7, t4
-; RV32ZICOND-NEXT:    add a6, a7, a6
-; RV32ZICOND-NEXT:    sltu a7, t5, t4
-; RV32ZICOND-NEXT:    add a6, a6, a7
+; RV32ZICOND-NEXT:    add t5, t0, t5
+; RV32ZICOND-NEXT:    sltu a6, t0, t1
+; RV32ZICOND-NEXT:    add a5, t4, a5
+; RV32ZICOND-NEXT:    sltu t0, t5, t0
 ; RV32ZICOND-NEXT:    add a5, a5, a6
-; RV32ZICOND-NEXT:    add a5, a5, s0
-; RV32ZICOND-NEXT:    srai a4, a4, 31
-; RV32ZICOND-NEXT:    xor a5, a5, a4
-; RV32ZICOND-NEXT:    xor a4, t6, a4
-; RV32ZICOND-NEXT:    or a4, a4, a5
+; RV32ZICOND-NEXT:    xor a6, t5, a4
+; RV32ZICOND-NEXT:    add a5, a5, a7
+; RV32ZICOND-NEXT:    add a5, a5, t0
+; RV32ZICOND-NEXT:    xor a4, a5, a4
+; RV32ZICOND-NEXT:    or a4, a6, a4
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.nez a2, a3, a4
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
 ; RV32ZICOND-NEXT:    .cfi_restore s0
 ; RV32ZICOND-NEXT:    addi sp, sp, 16
@@ -3555,58 +3540,50 @@ entry:
 define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: smulo.not.i64:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    sltu a5, a4, a5
 ; RV32-NEXT:    mulhu a6, a1, a2
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    mul a6, a0, a3
-; RV32-NEXT:    add a4, a6, a4
-; RV32-NEXT:    sltu a6, a4, a6
-; RV32-NEXT:    mulhu a7, a0, a3
-; RV32-NEXT:    add a6, a7, a6
-; RV32-NEXT:    add a6, a5, a6
-; RV32-NEXT:    mul a7, a1, a3
-; RV32-NEXT:    add t0, a7, a6
-; RV32-NEXT:    srai t1, a1, 31
-; RV32-NEXT:    mul t2, a2, t1
+; RV32-NEXT:    mul a7, a0, a3
+; RV32-NEXT:    mulhu t0, a0, a3
+; RV32-NEXT:    mul t1, a1, a3
+; RV32-NEXT:    srai t2, a1, 31
 ; RV32-NEXT:    srai t3, a3, 31
-; RV32-NEXT:    mul t4, t3, a0
-; RV32-NEXT:    add t5, t4, t2
-; RV32-NEXT:    add t6, t0, t5
-; RV32-NEXT:    sltu s0, t6, t0
-; RV32-NEXT:    sltu a7, t0, a7
-; RV32-NEXT:    sltu a5, a6, a5
-; RV32-NEXT:    mulhu a6, a1, a3
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    add a5, a5, a7
-; RV32-NEXT:    mulhu a2, a2, t1
-; RV32-NEXT:    add a2, a2, t2
-; RV32-NEXT:    mul a3, a3, t1
-; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    mulhu t4, a1, a3
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    mul t5, a2, t2
+; RV32-NEXT:    mul t6, t3, a0
+; RV32-NEXT:    mulhu a2, a2, t2
+; RV32-NEXT:    mul a3, a3, t2
 ; RV32-NEXT:    mul a1, t3, a1
 ; RV32-NEXT:    mulhu a0, t3, a0
+; RV32-NEXT:    sltu a5, a4, a5
+; RV32-NEXT:    add a4, a7, a4
+; RV32-NEXT:    add t2, t6, t5
+; RV32-NEXT:    add a2, a2, t5
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, t4
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    sltu a1, t5, t4
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a5, a0
-; RV32-NEXT:    add a0, a0, s0
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    sltu a1, a4, a7
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, t6
+; RV32-NEXT:    sltu a3, t2, t6
 ; RV32-NEXT:    srai a4, a4, 31
+; RV32-NEXT:    add a1, t0, a1
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a1, a5, a1
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a2, t1, a1
+; RV32-NEXT:    sltu a1, a1, a5
+; RV32-NEXT:    add t2, a2, t2
+; RV32-NEXT:    sltu a3, a2, t1
+; RV32-NEXT:    add a1, t4, a1
+; RV32-NEXT:    sltu a2, t2, a2
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    xor a0, a0, a4
-; RV32-NEXT:    xor a1, t6, a4
+; RV32-NEXT:    xor a1, t2, a4
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore s0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: smulo.not.i64:
@@ -3620,58 +3597,50 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZBA-LABEL: smulo.not.i64:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    addi sp, sp, -16
-; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
-; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT:    .cfi_offset s0, -4
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
-; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    sltu a5, a4, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a2
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    mul a6, a0, a3
-; RV32ZBA-NEXT:    add a4, a6, a4
-; RV32ZBA-NEXT:    sltu a6, a4, a6
-; RV32ZBA-NEXT:    mulhu a7, a0, a3
-; RV32ZBA-NEXT:    add a6, a7, a6
-; RV32ZBA-NEXT:    add a6, a5, a6
-; RV32ZBA-NEXT:    mul a7, a1, a3
-; RV32ZBA-NEXT:    add t0, a7, a6
-; RV32ZBA-NEXT:    srai t1, a1, 31
-; RV32ZBA-NEXT:    mul t2, a2, t1
+; RV32ZBA-NEXT:    mul a7, a0, a3
+; RV32ZBA-NEXT:    mulhu t0, a0, a3
+; RV32ZBA-NEXT:    mul t1, a1, a3
+; RV32ZBA-NEXT:    srai t2, a1, 31
 ; RV32ZBA-NEXT:    srai t3, a3, 31
-; RV32ZBA-NEXT:    mul t4, t3, a0
-; RV32ZBA-NEXT:    add t5, t4, t2
-; RV32ZBA-NEXT:    add t6, t0, t5
-; RV32ZBA-NEXT:    sltu s0, t6, t0
-; RV32ZBA-NEXT:    sltu a7, t0, a7
-; RV32ZBA-NEXT:    sltu a5, a6, a5
-; RV32ZBA-NEXT:    mulhu a6, a1, a3
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    add a5, a5, a7
-; RV32ZBA-NEXT:    mulhu a2, a2, t1
-; RV32ZBA-NEXT:    add a2, a2, t2
-; RV32ZBA-NEXT:    mul a3, a3, t1
-; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    mulhu t4, a1, a3
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    mul t5, a2, t2
+; RV32ZBA-NEXT:    mul t6, t3, a0
+; RV32ZBA-NEXT:    mulhu a2, a2, t2
+; RV32ZBA-NEXT:    mul a3, a3, t2
 ; RV32ZBA-NEXT:    mul a1, t3, a1
 ; RV32ZBA-NEXT:    mulhu a0, t3, a0
+; RV32ZBA-NEXT:    sltu a5, a4, a5
+; RV32ZBA-NEXT:    add a4, a7, a4
+; RV32ZBA-NEXT:    add t2, t6, t5
+; RV32ZBA-NEXT:    add a2, a2, t5
 ; RV32ZBA-NEXT:    add a0, a0, a1
-; RV32ZBA-NEXT:    add a0, a0, t4
-; RV32ZBA-NEXT:    add a0, a0, a2
-; RV32ZBA-NEXT:    sltu a1, t5, t4
-; RV32ZBA-NEXT:    add a0, a0, a1
-; RV32ZBA-NEXT:    add a0, a5, a0
-; RV32ZBA-NEXT:    add a0, a0, s0
+; RV32ZBA-NEXT:    add a5, a6, a5
+; RV32ZBA-NEXT:    sltu a1, a4, a7
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    add a0, a0, t6
+; RV32ZBA-NEXT:    sltu a3, t2, t6
 ; RV32ZBA-NEXT:    srai a4, a4, 31
+; RV32ZBA-NEXT:    add a1, t0, a1
+; RV32ZBA-NEXT:    add a0, a0, a2
+; RV32ZBA-NEXT:    add a1, a5, a1
+; RV32ZBA-NEXT:    add a0, a0, a3
+; RV32ZBA-NEXT:    add a2, t1, a1
+; RV32ZBA-NEXT:    sltu a1, a1, a5
+; RV32ZBA-NEXT:    add t2, a2, t2
+; RV32ZBA-NEXT:    sltu a3, a2, t1
+; RV32ZBA-NEXT:    add a1, t4, a1
+; RV32ZBA-NEXT:    sltu a2, t2, a2
+; RV32ZBA-NEXT:    add a1, a1, a3
+; RV32ZBA-NEXT:    add a0, a1, a0
+; RV32ZBA-NEXT:    add a0, a0, a2
 ; RV32ZBA-NEXT:    xor a0, a0, a4
-; RV32ZBA-NEXT:    xor a1, t6, a4
+; RV32ZBA-NEXT:    xor a1, t2, a4
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    seqz a0, a0
-; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT:    .cfi_restore s0
-; RV32ZBA-NEXT:    addi sp, sp, 16
-; RV32ZBA-NEXT:    .cfi_def_cfa_offset 0
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo.not.i64:
@@ -3685,58 +3654,50 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZICOND-LABEL: smulo.not.i64:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    addi sp, sp, -16
-; RV32ZICOND-NEXT:    .cfi_def_cfa_offset 16
-; RV32ZICOND-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZICOND-NEXT:    .cfi_offset s0, -4
 ; RV32ZICOND-NEXT:    mulhu a4, a0, a2
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    sltu a5, a4, a5
 ; RV32ZICOND-NEXT:    mulhu a6, a1, a2
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    mul a6, a0, a3
-; RV32ZICOND-NEXT:    add a4, a6, a4
-; RV32ZICOND-NEXT:    sltu a6, a4, a6
-; RV32ZICOND-NEXT:    mulhu a7, a0, a3
-; RV32ZICOND-NEXT:    add a6, a7, a6
-; RV32ZICOND-NEXT:    add a6, a5, a6
-; RV32ZICOND-NEXT:    mul a7, a1, a3
-; RV32ZICOND-NEXT:    add t0, a7, a6
-; RV32ZICOND-NEXT:    srai t1, a1, 31
-; RV32ZICOND-NEXT:    mul t2, a2, t1
+; RV32ZICOND-NEXT:    mul a7, a0, a3
+; RV32ZICOND-NEXT:    mulhu t0, a0, a3
+; RV32ZICOND-NEXT:    mul t1, a1, a3
+; RV32ZICOND-NEXT:    srai t2, a1, 31
 ; RV32ZICOND-NEXT:    srai t3, a3, 31
-; RV32ZICOND-NEXT:    mul t4, t3, a0
-; RV32ZICOND-NEXT:    add t5, t4, t2
-; RV32ZICOND-NEXT:    add t6, t0, t5
-; RV32ZICOND-NEXT:    sltu s0, t6, t0
-; RV32ZICOND-NEXT:    sltu a7, t0, a7
-; RV32ZICOND-NEXT:    sltu a5, a6, a5
-; RV32ZICOND-NEXT:    mulhu a6, a1, a3
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    add a5, a5, a7
-; RV32ZICOND-NEXT:    mulhu a2, a2, t1
-; RV32ZICOND-NEXT:    add a2, a2, t2
-; RV32ZICOND-NEXT:    mul a3, a3, t1
-; RV32ZICOND-NEXT:    add a2, a2, a3
+; RV32ZICOND-NEXT:    mulhu t4, a1, a3
+; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    mul t5, a2, t2
+; RV32ZICOND-NEXT:    mul t6, t3, a0
+; RV32ZICOND-NEXT:    mulhu a2, a2, t2
+; RV32ZICOND-NEXT:    mul a3, a3, t2
 ; RV32ZICOND-NEXT:    mul a1, t3, a1
 ; RV32ZICOND-NEXT:    mulhu a0, t3, a0
+; RV32ZICOND-NEXT:    sltu a5, a4, a5
+; RV32ZICOND-NEXT:    add a4, a7, a4
+; RV32ZICOND-NEXT:    add t2, t6, t5
+; RV32ZICOND-NEXT:    add a2, a2, t5
 ; RV32ZICOND-NEXT:    add a0, a0, a1
-; RV32ZICOND-NEXT:    add a0, a0, t4
-; RV32ZICOND-NEXT:    add a0, a0, a2
-; RV32ZICOND-NEXT:    sltu a1, t5, t4
-; RV32ZICOND-NEXT:    add a0, a0, a1
-; RV32ZICOND-NEXT:    add a0, a5, a0
-; RV32ZICOND-NEXT:    add a0, a0, s0
+; RV32ZICOND-NEXT:    add a5, a6, a5
+; RV32ZICOND-NEXT:    sltu a1, a4, a7
+; RV32ZICOND-NEXT:    add a2, a2, a3
+; RV32ZICOND-NEXT:    add a0, a0, t6
+; RV32ZICOND-NEXT:    sltu a3, t2, t6
 ; RV32ZICOND-NEXT:    srai a4, a4, 31
+; RV32ZICOND-NEXT:    add a1, t0, a1
+; RV32ZICOND-NEXT:    add a0, a0, a2
+; RV32ZICOND-NEXT:    add a1, a5, a1
+; RV32ZICOND-NEXT:    add a0, a0, a3
+; RV32ZICOND-NEXT:    add a2, t1, a1
+; RV32ZICOND-NEXT:    sltu a1, a1, a5
+; RV32ZICOND-NEXT:    add t2, a2, t2
+; RV32ZICOND-NEXT:    sltu a3, a2, t1
+; RV32ZICOND-NEXT:    add a1, t4, a1
+; RV32ZICOND-NEXT:    sltu a2, t2, a2
+; RV32ZICOND-NEXT:    add a1, a1, a3
+; RV32ZICOND-NEXT:    add a0, a1, a0
+; RV32ZICOND-NEXT:    add a0, a0, a2
 ; RV32ZICOND-NEXT:    xor a0, a0, a4
-; RV32ZICOND-NEXT:    xor a1, t6, a4
+; RV32ZICOND-NEXT:    xor a1, t2, a4
 ; RV32ZICOND-NEXT:    or a0, a1, a0
 ; RV32ZICOND-NEXT:    seqz a0, a0
-; RV32ZICOND-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZICOND-NEXT:    .cfi_restore s0
-; RV32ZICOND-NEXT:    addi sp, sp, 16
-; RV32ZICOND-NEXT:    .cfi_def_cfa_offset 0
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: smulo.not.i64:
@@ -3879,16 +3840,16 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    mul a4, a3, a0
 ; RV32-NEXT:    mul a5, a1, a2
+; RV32-NEXT:    snez a6, a3
 ; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    mulhu a5, a0, a2
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    sltu a4, a4, a5
-; RV32-NEXT:    snez a5, a3
-; RV32-NEXT:    snez a6, a1
-; RV32-NEXT:    and a5, a6, a5
+; RV32-NEXT:    snez a5, a1
+; RV32-NEXT:    and a5, a5, a6
 ; RV32-NEXT:    mulhu a6, a1, a2
 ; RV32-NEXT:    snez a6, a6
 ; RV32-NEXT:    or a5, a5, a6
+; RV32-NEXT:    mulhu a6, a0, a2
+; RV32-NEXT:    add a4, a6, a4
+; RV32-NEXT:    sltu a4, a4, a6
 ; RV32-NEXT:    mulhu a6, a3, a0
 ; RV32-NEXT:    snez a6, a6
 ; RV32-NEXT:    or a5, a5, a6
@@ -3913,16 +3874,16 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    mul a4, a3, a0
 ; RV32ZBA-NEXT:    mul a5, a1, a2
+; RV32ZBA-NEXT:    snez a6, a3
 ; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    mulhu a5, a0, a2
-; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    sltu a4, a4, a5
-; RV32ZBA-NEXT:    snez a5, a3
-; RV32ZBA-NEXT:    snez a6, a1
-; RV32ZBA-NEXT:    and a5, a6, a5
+; RV32ZBA-NEXT:    snez a5, a1
+; RV32ZBA-NEXT:    and a5, a5, a6
 ; RV32ZBA-NEXT:    mulhu a6, a1, a2
 ; RV32ZBA-NEXT:    snez a6, a6
 ; RV32ZBA-NEXT:    or a5, a5, a6
+; RV32ZBA-NEXT:    mulhu a6, a0, a2
+; RV32ZBA-NEXT:    add a4, a6, a4
+; RV32ZBA-NEXT:    sltu a4, a4, a6
 ; RV32ZBA-NEXT:    mulhu a6, a3, a0
 ; RV32ZBA-NEXT:    snez a6, a6
 ; RV32ZBA-NEXT:    or a5, a5, a6
@@ -3947,26 +3908,26 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    mul a4, a3, a0
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
+; RV32ZICOND-NEXT:    snez a6, a3
 ; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    mulhu a5, a0, a2
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    sltu a4, a4, a5
-; RV32ZICOND-NEXT:    snez a5, a3
-; RV32ZICOND-NEXT:    snez a6, a1
-; RV32ZICOND-NEXT:    and a5, a6, a5
+; RV32ZICOND-NEXT:    snez a5, a1
+; RV32ZICOND-NEXT:    and a5, a5, a6
 ; RV32ZICOND-NEXT:    mulhu a6, a1, a2
 ; RV32ZICOND-NEXT:    snez a6, a6
 ; RV32ZICOND-NEXT:    or a5, a5, a6
+; RV32ZICOND-NEXT:    mulhu a6, a0, a2
+; RV32ZICOND-NEXT:    add a4, a6, a4
+; RV32ZICOND-NEXT:    sltu a4, a4, a6
 ; RV32ZICOND-NEXT:    mulhu a6, a3, a0
 ; RV32ZICOND-NEXT:    snez a6, a6
 ; RV32ZICOND-NEXT:    or a5, a5, a6
 ; RV32ZICOND-NEXT:    or a4, a5, a4
 ; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
 ; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
-; RV32ZICOND-NEXT:    or a0, a0, a2
-; RV32ZICOND-NEXT:    czero.nez a2, a3, a4
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a4
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
-; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: umulo.select.i64:
@@ -3988,20 +3949,20 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    mul a4, a3, a0
 ; RV32-NEXT:    mul a5, a1, a2
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    mulhu a5, a0, a2
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    sltu a4, a4, a5
-; RV32-NEXT:    snez a5, a3
-; RV32-NEXT:    snez a6, a1
-; RV32-NEXT:    and a5, a6, a5
-; RV32-NEXT:    mulhu a1, a1, a2
-; RV32-NEXT:    snez a1, a1
-; RV32-NEXT:    or a1, a5, a1
+; RV32-NEXT:    mulhu a6, a0, a2
 ; RV32-NEXT:    mulhu a0, a3, a0
+; RV32-NEXT:    snez a3, a3
+; RV32-NEXT:    mulhu a2, a1, a2
+; RV32-NEXT:    snez a1, a1
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    and a1, a1, a3
+; RV32-NEXT:    snez a2, a2
 ; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    add a4, a6, a4
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    sltu a2, a4, a6
 ; RV32-NEXT:    or a0, a1, a0
-; RV32-NEXT:    or a0, a0, a4
+; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    xori a0, a0, 1
 ; RV32-NEXT:    ret
 ;
@@ -4015,20 +3976,20 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    mul a4, a3, a0
 ; RV32ZBA-NEXT:    mul a5, a1, a2
-; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    mulhu a5, a0, a2
-; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    sltu a4, a4, a5
-; RV32ZBA-NEXT:    snez a5, a3
-; RV32ZBA-NEXT:    snez a6, a1
-; RV32ZBA-NEXT:    and a5, a6, a5
-; RV32ZBA-NEXT:    mulhu a1, a1, a2
-; RV32ZBA-NEXT:    snez a1, a1
-; RV32ZBA-NEXT:    or a1, a5, a1
+; RV32ZBA-NEXT:    mulhu a6, a0, a2
 ; RV32ZBA-NEXT:    mulhu a0, a3, a0
+; RV32ZBA-NEXT:    snez a3, a3
+; RV32ZBA-NEXT:    mulhu a2, a1, a2
+; RV32ZBA-NEXT:    snez a1, a1
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    and a1, a1, a3
+; RV32ZBA-NEXT:    snez a2, a2
 ; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    add a4, a6, a4
+; RV32ZBA-NEXT:    or a1, a1, a2
+; RV32ZBA-NEXT:    sltu a2, a4, a6
 ; RV32ZBA-NEXT:    or a0, a1, a0
-; RV32ZBA-NEXT:    or a0, a0, a4
+; RV32ZBA-NEXT:    or a0, a0, a2
 ; RV32ZBA-NEXT:    xori a0, a0, 1
 ; RV32ZBA-NEXT:    ret
 ;
@@ -4042,20 +4003,20 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    mul a4, a3, a0
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    mulhu a5, a0, a2
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    sltu a4, a4, a5
-; RV32ZICOND-NEXT:    snez a5, a3
-; RV32ZICOND-NEXT:    snez a6, a1
-; RV32ZICOND-NEXT:    and a5, a6, a5
-; RV32ZICOND-NEXT:    mulhu a1, a1, a2
-; RV32ZICOND-NEXT:    snez a1, a1
-; RV32ZICOND-NEXT:    or a1, a5, a1
+; RV32ZICOND-NEXT:    mulhu a6, a0, a2
 ; RV32ZICOND-NEXT:    mulhu a0, a3, a0
+; RV32ZICOND-NEXT:    snez a3, a3
+; RV32ZICOND-NEXT:    mulhu a2, a1, a2
+; RV32ZICOND-NEXT:    snez a1, a1
+; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    and a1, a1, a3
+; RV32ZICOND-NEXT:    snez a2, a2
 ; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    add a4, a6, a4
+; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    sltu a2, a4, a6
 ; RV32ZICOND-NEXT:    or a0, a1, a0
-; RV32ZICOND-NEXT:    or a0, a0, a4
+; RV32ZICOND-NEXT:    or a0, a0, a2
 ; RV32ZICOND-NEXT:    xori a0, a0, 1
 ; RV32ZICOND-NEXT:    ret
 ;
@@ -4168,11 +4129,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    add a4, a1, a3
 ; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    xor a3, a1, a3
 ; RV32-NEXT:    sltu a0, a2, a0
 ; RV32-NEXT:    add a0, a4, a0
 ; RV32-NEXT:    xor a0, a1, a0
-; RV32-NEXT:    xor a1, a1, a3
-; RV32-NEXT:    not a1, a1
+; RV32-NEXT:    not a1, a3
 ; RV32-NEXT:    and a0, a1, a0
 ; RV32-NEXT:    bgez a0, .LBB53_2
 ; RV32-NEXT:  # %bb.1: # %overflow
@@ -4199,11 +4160,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    add a4, a1, a3
 ; RV32ZBA-NEXT:    add a2, a0, a2
+; RV32ZBA-NEXT:    xor a3, a1, a3
 ; RV32ZBA-NEXT:    sltu a0, a2, a0
 ; RV32ZBA-NEXT:    add a0, a4, a0
 ; RV32ZBA-NEXT:    xor a0, a1, a0
-; RV32ZBA-NEXT:    xor a1, a1, a3
-; RV32ZBA-NEXT:    not a1, a1
+; RV32ZBA-NEXT:    not a1, a3
 ; RV32ZBA-NEXT:    and a0, a1, a0
 ; RV32ZBA-NEXT:    bgez a0, .LBB53_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
@@ -4230,11 +4191,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a4, a1, a3
 ; RV32ZICOND-NEXT:    add a2, a0, a2
+; RV32ZICOND-NEXT:    xor a3, a1, a3
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
 ; RV32ZICOND-NEXT:    add a0, a4, a0
 ; RV32ZICOND-NEXT:    xor a0, a1, a0
-; RV32ZICOND-NEXT:    xor a1, a1, a3
-; RV32ZICOND-NEXT:    not a1, a1
+; RV32ZICOND-NEXT:    not a1, a3
 ; RV32ZICOND-NEXT:    and a0, a1, a0
 ; RV32ZICOND-NEXT:    bgez a0, .LBB53_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
@@ -4786,13 +4747,13 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    sltu a4, a0, a2
 ; RV32ZICOND-NEXT:    sub a3, a1, a3
-; RV32ZICOND-NEXT:    sub a3, a3, a4
-; RV32ZICOND-NEXT:    xor a4, a3, a1
-; RV32ZICOND-NEXT:    sltu a1, a1, a3
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
 ; RV32ZICOND-NEXT:    sub a2, a0, a2
+; RV32ZICOND-NEXT:    sub a3, a3, a4
 ; RV32ZICOND-NEXT:    sltu a0, a0, a2
-; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
+; RV32ZICOND-NEXT:    xor a2, a3, a1
+; RV32ZICOND-NEXT:    sltu a1, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a2
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
 ; RV32ZICOND-NEXT:    or a0, a0, a1
 ; RV32ZICOND-NEXT:    beqz a0, .LBB59_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
@@ -4916,64 +4877,55 @@ continue:
 define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32-LABEL: smulo.br.i64:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset s0, -4
 ; RV32-NEXT:    mulhu a4, a0, a2
 ; RV32-NEXT:    mul a5, a1, a2
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    sltu a5, a4, a5
 ; RV32-NEXT:    mulhu a6, a1, a2
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    mul a6, a0, a3
-; RV32-NEXT:    add a4, a6, a4
-; RV32-NEXT:    sltu a6, a4, a6
-; RV32-NEXT:    mulhu a7, a0, a3
-; RV32-NEXT:    add a6, a7, a6
-; RV32-NEXT:    add a6, a5, a6
-; RV32-NEXT:    mul a7, a1, a3
-; RV32-NEXT:    add t0, a7, a6
-; RV32-NEXT:    srai t1, a1, 31
-; RV32-NEXT:    mul t2, a2, t1
+; RV32-NEXT:    mul a7, a0, a3
+; RV32-NEXT:    mulhu t0, a0, a3
+; RV32-NEXT:    mul t1, a1, a3
+; RV32-NEXT:    srai t2, a1, 31
 ; RV32-NEXT:    srai t3, a3, 31
-; RV32-NEXT:    mul t4, t3, a0
-; RV32-NEXT:    add t5, t4, t2
-; RV32-NEXT:    add t6, t0, t5
-; RV32-NEXT:    sltu s0, t6, t0
-; RV32-NEXT:    sltu a7, t0, a7
-; RV32-NEXT:    sltu a5, a6, a5
-; RV32-NEXT:    mulhu a6, a1, a3
-; RV32-NEXT:    add a5, a6, a5
-; RV32-NEXT:    add a5, a5, a7
-; RV32-NEXT:    mulhu a2, a2, t1
-; RV32-NEXT:    add a2, a2, t2
-; RV32-NEXT:    mul a3, a3, t1
-; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    mulhu t4, a1, a3
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    mul t5, a2, t2
+; RV32-NEXT:    mul t6, t3, a0
+; RV32-NEXT:    mulhu a2, a2, t2
+; RV32-NEXT:    mul a3, a3, t2
 ; RV32-NEXT:    mul a1, t3, a1
 ; RV32-NEXT:    mulhu a0, t3, a0
+; RV32-NEXT:    sltu a5, a4, a5
+; RV32-NEXT:    add a4, a7, a4
+; RV32-NEXT:    add t2, t6, t5
+; RV32-NEXT:    add a2, a2, t5
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a0, t4
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    sltu a1, t5, t4
-; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    add a0, a5, a0
-; RV32-NEXT:    add a0, a0, s0
+; RV32-NEXT:    add a5, a6, a5
+; RV32-NEXT:    sltu a1, a4, a7
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    add a0, a0, t6
+; RV32-NEXT:    sltu a3, t2, t6
 ; RV32-NEXT:    srai a4, a4, 31
+; RV32-NEXT:    add a1, t0, a1
+; RV32-NEXT:    add a0, a0, a2
+; RV32-NEXT:    add a1, a5, a1
+; RV32-NEXT:    add a0, a0, a3
+; RV32-NEXT:    add a2, t1, a1
+; RV32-NEXT:    sltu a1, a1, a5
+; RV32-NEXT:    add t2, a2, t2
+; RV32-NEXT:    sltu a3, a2, t1
+; RV32-NEXT:    add a1, t4, a1
+; RV32-NEXT:    sltu a2, t2, a2
+; RV32-NEXT:    add a1, a1, a3
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add a0, a0, a2
 ; RV32-NEXT:    xor a0, a0, a4
-; RV32-NEXT:    xor a1, t6, a4
+; RV32-NEXT:    xor a1, t2, a4
 ; RV32-NEXT:    or a0, a1, a0
 ; RV32-NEXT:    beqz a0, .LBB61_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
-; RV32-NEXT:    j .LBB61_3
+; RV32-NEXT:    ret
 ; RV32-NEXT:  .LBB61_2: # %continue
 ; RV32-NEXT:    li a0, 1
-; RV32-NEXT:  .LBB61_3: # %overflow
-; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore s0
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: smulo.br.i64:
@@ -4991,64 +4943,55 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZBA-LABEL: smulo.br.i64:
 ; RV32ZBA:       # %bb.0: # %entry
-; RV32ZBA-NEXT:    addi sp, sp, -16
-; RV32ZBA-NEXT:    .cfi_def_cfa_offset 16
-; RV32ZBA-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZBA-NEXT:    .cfi_offset s0, -4
 ; RV32ZBA-NEXT:    mulhu a4, a0, a2
 ; RV32ZBA-NEXT:    mul a5, a1, a2
-; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    sltu a5, a4, a5
 ; RV32ZBA-NEXT:    mulhu a6, a1, a2
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    mul a6, a0, a3
-; RV32ZBA-NEXT:    add a4, a6, a4
-; RV32ZBA-NEXT:    sltu a6, a4, a6
-; RV32ZBA-NEXT:    mulhu a7, a0, a3
-; RV32ZBA-NEXT:    add a6, a7, a6
-; RV32ZBA-NEXT:    add a6, a5, a6
-; RV32ZBA-NEXT:    mul a7, a1, a3
-; RV32ZBA-NEXT:    add t0, a7, a6
-; RV32ZBA-NEXT:    srai t1, a1, 31
-; RV32ZBA-NEXT:    mul t2, a2, t1
+; RV32ZBA-NEXT:    mul a7, a0, a3
+; RV32ZBA-NEXT:    mulhu t0, a0, a3
+; RV32ZBA-NEXT:    mul t1, a1, a3
+; RV32ZBA-NEXT:    srai t2, a1, 31
 ; RV32ZBA-NEXT:    srai t3, a3, 31
-; RV32ZBA-NEXT:    mul t4, t3, a0
-; RV32ZBA-NEXT:    add t5, t4, t2
-; RV32ZBA-NEXT:    add t6, t0, t5
-; RV32ZBA-NEXT:    sltu s0, t6, t0
-; RV32ZBA-NEXT:    sltu a7, t0, a7
-; RV32ZBA-NEXT:    sltu a5, a6, a5
-; RV32ZBA-NEXT:    mulhu a6, a1, a3
-; RV32ZBA-NEXT:    add a5, a6, a5
-; RV32ZBA-NEXT:    add a5, a5, a7
-; RV32ZBA-NEXT:    mulhu a2, a2, t1
-; RV32ZBA-NEXT:    add a2, a2, t2
-; RV32ZBA-NEXT:    mul a3, a3, t1
-; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    mulhu t4, a1, a3
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    mul t5, a2, t2
+; RV32ZBA-NEXT:    mul t6, t3, a0
+; RV32ZBA-NEXT:    mulhu a2, a2, t2
+; RV32ZBA-NEXT:    mul a3, a3, t2
 ; RV32ZBA-NEXT:    mul a1, t3, a1
 ; RV32ZBA-NEXT:    mulhu a0, t3, a0
+; RV32ZBA-NEXT:    sltu a5, a4, a5
+; RV32ZBA-NEXT:    add a4, a7, a4
+; RV32ZBA-NEXT:    add t2, t6, t5
+; RV32ZBA-NEXT:    add a2, a2, t5
 ; RV32ZBA-NEXT:    add a0, a0, a1
-; RV32ZBA-NEXT:    add a0, a0, t4
-; RV32ZBA-NEXT:    add a0, a0, a2
-; RV32ZBA-NEXT:    sltu a1, t5, t4
-; RV32ZBA-NEXT:    add a0, a0, a1
-; RV32ZBA-NEXT:    add a0, a5, a0
-; RV32ZBA-NEXT:    add a0, a0, s0
+; RV32ZBA-NEXT:    add a5, a6, a5
+; RV32ZBA-NEXT:    sltu a1, a4, a7
+; RV32ZBA-NEXT:    add a2, a2, a3
+; RV32ZBA-NEXT:    add a0, a0, t6
+; RV32ZBA-NEXT:    sltu a3, t2, t6
 ; RV32ZBA-NEXT:    srai a4, a4, 31
+; RV32ZBA-NEXT:    add a1, t0, a1
+; RV32ZBA-NEXT:    add a0, a0, a2
+; RV32ZBA-NEXT:    add a1, a5, a1
+; RV32ZBA-NEXT:    add a0, a0, a3
+; RV32ZBA-NEXT:    add a2, t1, a1
+; RV32ZBA-NEXT:    sltu a1, a1, a5
+; RV32ZBA-NEXT:    add t2, a2, t2
+; RV32ZBA-NEXT:    sltu a3, a2, t1
+; RV32ZBA-NEXT:    add a1, t4, a1
+; RV32ZBA-NEXT:    sltu a2, t2, a2
+; RV32ZBA-NEXT:    add a1, a1, a3
+; RV32ZBA-NEXT:    add a0, a1, a0
+; RV32ZBA-NEXT:    add a0, a0, a2
 ; RV32ZBA-NEXT:    xor a0, a0, a4
-; RV32ZBA-NEXT:    xor a1, t6, a4
+; RV32ZBA-NEXT:    xor a1, t2, a4
 ; RV32ZBA-NEXT:    or a0, a1, a0
 ; RV32ZBA-NEXT:    beqz a0, .LBB61_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
-; RV32ZBA-NEXT:    j .LBB61_3
+; RV32ZBA-NEXT:    ret
 ; RV32ZBA-NEXT:  .LBB61_2: # %continue
 ; RV32ZBA-NEXT:    li a0, 1
-; RV32ZBA-NEXT:  .LBB61_3: # %overflow
-; RV32ZBA-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZBA-NEXT:    .cfi_restore s0
-; RV32ZBA-NEXT:    addi sp, sp, 16
-; RV32ZBA-NEXT:    .cfi_def_cfa_offset 0
 ; RV32ZBA-NEXT:    ret
 ;
 ; RV64ZBA-LABEL: smulo.br.i64:
@@ -5066,64 +5009,55 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
 ;
 ; RV32ZICOND-LABEL: smulo.br.i64:
 ; RV32ZICOND:       # %bb.0: # %entry
-; RV32ZICOND-NEXT:    addi sp, sp, -16
-; RV32ZICOND-NEXT:    .cfi_def_cfa_offset 16
-; RV32ZICOND-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
-; RV32ZICOND-NEXT:    .cfi_offset s0, -4
 ; RV32ZICOND-NEXT:    mulhu a4, a0, a2
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    sltu a5, a4, a5
 ; RV32ZICOND-NEXT:    mulhu a6, a1, a2
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    mul a6, a0, a3
-; RV32ZICOND-NEXT:    add a4, a6, a4
-; RV32ZICOND-NEXT:    sltu a6, a4, a6
-; RV32ZICOND-NEXT:    mulhu a7, a0, a3
-; RV32ZICOND-NEXT:    add a6, a7, a6
-; RV32ZICOND-NEXT:    add a6, a5, a6
-; RV32ZICOND-NEXT:    mul a7, a1, a3
-; RV32ZICOND-NEXT:    add t0, a7, a6
-; RV32ZICOND-NEXT:    srai t1, a1, 31
-; RV32ZICOND-NEXT:    mul t2, a2, t1
+; RV32ZICOND-NEXT:    mul a7, a0, a3
+; RV32ZICOND-NEXT:    mulhu t0, a0, a3
+; RV32ZICOND-NEXT:    mul t1, a1, a3
+; RV32ZICOND-NEXT:    srai t2, a1, 31
 ; RV32ZICOND-NEXT:    srai t3, a3, 31
-; RV32ZICOND-NEXT:    mul t4, t3, a0
-; RV32ZICOND-NEXT:    add t5, t4, t2
-; RV32ZICOND-NEXT:    add t6, t0, t5
-; RV32ZICOND-NEXT:    sltu s0, t6, t0
-; RV32ZICOND-NEXT:    sltu a7, t0, a7
-; RV32ZICOND-NEXT:    sltu a5, a6, a5
-; RV32ZICOND-NEXT:    mulhu a6, a1, a3
-; RV32ZICOND-NEXT:    add a5, a6, a5
-; RV32ZICOND-NEXT:    add a5, a5, a7
-; RV32ZICOND-NEXT:    mulhu a2, a2, t1
-; RV32ZICOND-NEXT:    add a2, a2, t2
-; RV32ZICOND-NEXT:    mul a3, a3, t1
-; RV32ZICOND-NEXT:    add a2, a2, a3
+; RV32ZICOND-NEXT:    mulhu t4, a1, a3
+; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    mul t5, a2, t2
+; RV32ZICOND-NEXT:    mul t6, t3, a0
+; RV32ZICOND-NEXT:    mulhu a2, a2, t2
+; RV32ZICOND-NEXT:    mul a3, a3, t2
 ; RV32ZICOND-NEXT:    mul a1, t3, a1
 ; RV32ZICOND-NEXT:    mulhu a0, t3, a0
+; RV32ZICOND-NEXT:    sltu a5, a4, a5
+; RV32ZICOND-NEXT:    add a4, a7, a4
+; RV32ZICOND-NEXT:    add t2, t6, t5
+; RV32ZICOND-NEXT:    add a2, a2, t5
 ; RV32ZICOND-NEXT:    add a0, a0, a1
-; RV32ZICOND-NEXT:    add a0, a0, t4
-; RV32ZICOND-NEXT:    add a0, a0, a2
-; RV32ZICOND-NEXT:    sltu a1, t5, t4
-; RV32ZICOND-NEXT:    add a0, a0, a1
-; RV32ZICOND-NEXT:    add a0, a5, a0
-; RV32ZICOND-NEXT:    add a0, a0, s0
+; RV32ZICOND-NEXT:    add a5, a6, a5
+; RV32ZICOND-NEXT:    sltu a1, a4, a7
+; RV32ZICOND-NEXT:    add a2, a2, a3
+; RV32ZICOND-NEXT:    add a0, a0, t6
+; RV32ZICOND-NEXT:    sltu a3, t2, t6
 ; RV32ZICOND-NEXT:    srai a4, a4, 31
+; RV32ZICOND-NEXT:    add a1, t0, a1
+; RV32ZICOND-NEXT:    add a0, a0, a2
+; RV32ZICOND-NEXT:    add a1, a5, a1
+; RV32ZICOND-NEXT:    add a0, a0, a3
+; RV32ZICOND-NEXT:    add a2, t1, a1
+; RV32ZICOND-NEXT:    sltu a1, a1, a5
+; RV32ZICOND-NEXT:    add t2, a2, t2
+; RV32ZICOND-NEXT:    sltu a3, a2, t1
+; RV32ZICOND-NEXT:    add a1, t4, a1
+; RV32ZICOND-NEXT:    sltu a2, t2, a2
+; RV32ZICOND-NEXT:    add a1, a1, a3
+; RV32ZICOND-NEXT:    add a0, a1, a0
+; RV32ZICOND-NEXT:    add a0, a0, a2
 ; RV32ZICOND-NEXT:    xor a0, a0, a4
-; RV32ZICOND-NEXT:    xor a1, t6, a4
+; RV32ZICOND-NEXT:    xor a1, t2, a4
 ; RV32ZICOND-NEXT:    or a0, a1, a0
 ; RV32ZICOND-NEXT:    beqz a0, .LBB61_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
-; RV32ZICOND-NEXT:    j .LBB61_3
+; RV32ZICOND-NEXT:    ret
 ; RV32ZICOND-NEXT:  .LBB61_2: # %continue
 ; RV32ZICOND-NEXT:    li a0, 1
-; RV32ZICOND-NEXT:  .LBB61_3: # %overflow
-; RV32ZICOND-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
-; RV32ZICOND-NEXT:    .cfi_restore s0
-; RV32ZICOND-NEXT:    addi sp, sp, 16
-; RV32ZICOND-NEXT:    .cfi_def_cfa_offset 0
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: smulo.br.i64:
@@ -5155,43 +5089,43 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32-LABEL: smulo2.br.i64:
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    li a2, -13
-; RV32-NEXT:    mulhu a3, a0, a2
-; RV32-NEXT:    mul a4, a1, a2
-; RV32-NEXT:    add a3, a4, a3
-; RV32-NEXT:    sltu a4, a3, a4
-; RV32-NEXT:    mulhu a5, a1, a2
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    sub a3, a3, a0
-; RV32-NEXT:    neg a5, a0
-; RV32-NEXT:    sltu a6, a3, a5
-; RV32-NEXT:    li a7, -1
-; RV32-NEXT:    mulhu t0, a0, a7
-; RV32-NEXT:    add a6, t0, a6
-; RV32-NEXT:    add a6, a4, a6
-; RV32-NEXT:    sub t1, a6, a1
-; RV32-NEXT:    srai t2, a1, 31
-; RV32-NEXT:    mul t3, t2, a2
-; RV32-NEXT:    sub t3, t3, a0
-; RV32-NEXT:    add t4, t1, t3
-; RV32-NEXT:    sltu t5, t4, t1
-; RV32-NEXT:    neg t6, a1
-; RV32-NEXT:    sltu t1, t1, t6
-; RV32-NEXT:    sltu a4, a6, a4
-; RV32-NEXT:    mulhu a6, a1, a7
-; RV32-NEXT:    add a4, a6, a4
-; RV32-NEXT:    add a4, a4, t1
-; RV32-NEXT:    sltu a5, t3, a5
-; RV32-NEXT:    mulh a2, t2, a2
-; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    neg a3, a0
+; RV32-NEXT:    li a4, -1
+; RV32-NEXT:    srai a5, a1, 31
+; RV32-NEXT:    neg a6, a1
+; RV32-NEXT:    add a7, a0, a1
+; RV32-NEXT:    mulhu t0, a0, a2
+; RV32-NEXT:    mul t1, a1, a2
+; RV32-NEXT:    mulhu t2, a1, a2
+; RV32-NEXT:    mulhu t3, a0, a4
+; RV32-NEXT:    mul t4, a5, a2
+; RV32-NEXT:    mulhu a4, a1, a4
+; RV32-NEXT:    mulh a2, a5, a2
+; RV32-NEXT:    add t0, t1, t0
+; RV32-NEXT:    sub a5, t4, a0
+; RV32-NEXT:    sub a7, t3, a7
+; RV32-NEXT:    sltu t1, t0, t1
 ; RV32-NEXT:    sub a0, t0, a0
-; RV32-NEXT:    add a0, a0, a2
-; RV32-NEXT:    add a0, a0, a5
-; RV32-NEXT:    add a0, a4, a0
-; RV32-NEXT:    add a0, a0, t5
-; RV32-NEXT:    srai a3, a3, 31
-; RV32-NEXT:    xor a0, a0, a3
-; RV32-NEXT:    xor a1, t4, a3
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    sltu t0, a5, a3
+; RV32-NEXT:    add a2, a7, a2
+; RV32-NEXT:    add t1, t2, t1
+; RV32-NEXT:    sltu a3, a0, a3
+; RV32-NEXT:    add a2, a2, t0
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    add a3, t3, a3
+; RV32-NEXT:    add a3, t1, a3
+; RV32-NEXT:    sub a1, a3, a1
+; RV32-NEXT:    sltu a3, a3, t1
+; RV32-NEXT:    add a5, a1, a5
+; RV32-NEXT:    sltu a6, a1, a6
+; RV32-NEXT:    add a3, a4, a3
+; RV32-NEXT:    sltu a1, a5, a1
+; RV32-NEXT:    add a3, a3, a6
+; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    xor a1, a1, a0
+; RV32-NEXT:    xor a0, a5, a0
+; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    beqz a0, .LBB62_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
@@ -5217,43 +5151,43 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32ZBA-LABEL: smulo2.br.i64:
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    li a2, -13
-; RV32ZBA-NEXT:    mulhu a3, a0, a2
-; RV32ZBA-NEXT:    mul a4, a1, a2
-; RV32ZBA-NEXT:    add a3, a4, a3
-; RV32ZBA-NEXT:    sltu a4, a3, a4
-; RV32ZBA-NEXT:    mulhu a5, a1, a2
-; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    sub a3, a3, a0
-; RV32ZBA-NEXT:    neg a5, a0
-; RV32ZBA-NEXT:    sltu a6, a3, a5
-; RV32ZBA-NEXT:    li a7, -1
-; RV32ZBA-NEXT:    mulhu t0, a0, a7
-; RV32ZBA-NEXT:    add a6, t0, a6
-; RV32ZBA-NEXT:    add a6, a4, a6
-; RV32ZBA-NEXT:    sub t1, a6, a1
-; RV32ZBA-NEXT:    srai t2, a1, 31
-; RV32ZBA-NEXT:    mul t3, t2, a2
-; RV32ZBA-NEXT:    sub t3, t3, a0
-; RV32ZBA-NEXT:    add t4, t1, t3
-; RV32ZBA-NEXT:    sltu t5, t4, t1
-; RV32ZBA-NEXT:    neg t6, a1
-; RV32ZBA-NEXT:    sltu t1, t1, t6
-; RV32ZBA-NEXT:    sltu a4, a6, a4
-; RV32ZBA-NEXT:    mulhu a6, a1, a7
-; RV32ZBA-NEXT:    add a4, a6, a4
-; RV32ZBA-NEXT:    add a4, a4, t1
-; RV32ZBA-NEXT:    sltu a5, t3, a5
-; RV32ZBA-NEXT:    mulh a2, t2, a2
-; RV32ZBA-NEXT:    add a0, a0, a1
+; RV32ZBA-NEXT:    neg a3, a0
+; RV32ZBA-NEXT:    li a4, -1
+; RV32ZBA-NEXT:    srai a5, a1, 31
+; RV32ZBA-NEXT:    neg a6, a1
+; RV32ZBA-NEXT:    add a7, a0, a1
+; RV32ZBA-NEXT:    mulhu t0, a0, a2
+; RV32ZBA-NEXT:    mul t1, a1, a2
+; RV32ZBA-NEXT:    mulhu t2, a1, a2
+; RV32ZBA-NEXT:    mulhu t3, a0, a4
+; RV32ZBA-NEXT:    mul t4, a5, a2
+; RV32ZBA-NEXT:    mulhu a4, a1, a4
+; RV32ZBA-NEXT:    mulh a2, a5, a2
+; RV32ZBA-NEXT:    add t0, t1, t0
+; RV32ZBA-NEXT:    sub a5, t4, a0
+; RV32ZBA-NEXT:    sub a7, t3, a7
+; RV32ZBA-NEXT:    sltu t1, t0, t1
 ; RV32ZBA-NEXT:    sub a0, t0, a0
-; RV32ZBA-NEXT:    add a0, a0, a2
-; RV32ZBA-NEXT:    add a0, a0, a5
-; RV32ZBA-NEXT:    add a0, a4, a0
-; RV32ZBA-NEXT:    add a0, a0, t5
-; RV32ZBA-NEXT:    srai a3, a3, 31
-; RV32ZBA-NEXT:    xor a0, a0, a3
-; RV32ZBA-NEXT:    xor a1, t4, a3
-; RV32ZBA-NEXT:    or a0, a1, a0
+; RV32ZBA-NEXT:    sltu t0, a5, a3
+; RV32ZBA-NEXT:    add a2, a7, a2
+; RV32ZBA-NEXT:    add t1, t2, t1
+; RV32ZBA-NEXT:    sltu a3, a0, a3
+; RV32ZBA-NEXT:    add a2, a2, t0
+; RV32ZBA-NEXT:    srai a0, a0, 31
+; RV32ZBA-NEXT:    add a3, t3, a3
+; RV32ZBA-NEXT:    add a3, t1, a3
+; RV32ZBA-NEXT:    sub a1, a3, a1
+; RV32ZBA-NEXT:    sltu a3, a3, t1
+; RV32ZBA-NEXT:    add a5, a1, a5
+; RV32ZBA-NEXT:    sltu a6, a1, a6
+; RV32ZBA-NEXT:    add a3, a4, a3
+; RV32ZBA-NEXT:    sltu a1, a5, a1
+; RV32ZBA-NEXT:    add a3, a3, a6
+; RV32ZBA-NEXT:    add a2, a3, a2
+; RV32ZBA-NEXT:    add a1, a2, a1
+; RV32ZBA-NEXT:    xor a1, a1, a0
+; RV32ZBA-NEXT:    xor a0, a5, a0
+; RV32ZBA-NEXT:    or a0, a0, a1
 ; RV32ZBA-NEXT:    beqz a0, .LBB62_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
@@ -5279,43 +5213,43 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) {
 ; RV32ZICOND-LABEL: smulo2.br.i64:
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    li a2, -13
-; RV32ZICOND-NEXT:    mulhu a3, a0, a2
-; RV32ZICOND-NEXT:    mul a4, a1, a2
-; RV32ZICOND-NEXT:    add a3, a4, a3
-; RV32ZICOND-NEXT:    sltu a4, a3, a4
-; RV32ZICOND-NEXT:    mulhu a5, a1, a2
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    sub a3, a3, a0
-; RV32ZICOND-NEXT:    neg a5, a0
-; RV32ZICOND-NEXT:    sltu a6, a3, a5
-; RV32ZICOND-NEXT:    li a7, -1
-; RV32ZICOND-NEXT:    mulhu t0, a0, a7
-; RV32ZICOND-NEXT:    add a6, t0, a6
-; RV32ZICOND-NEXT:    add a6, a4, a6
-; RV32ZICOND-NEXT:    sub t1, a6, a1
-; RV32ZICOND-NEXT:    srai t2, a1, 31
-; RV32ZICOND-NEXT:    mul t3, t2, a2
-; RV32ZICOND-NEXT:    sub t3, t3, a0
-; RV32ZICOND-NEXT:    add t4, t1, t3
-; RV32ZICOND-NEXT:    sltu t5, t4, t1
-; RV32ZICOND-NEXT:    neg t6, a1
-; RV32ZICOND-NEXT:    sltu t1, t1, t6
-; RV32ZICOND-NEXT:    sltu a4, a6, a4
-; RV32ZICOND-NEXT:    mulhu a6, a1, a7
-; RV32ZICOND-NEXT:    add a4, a6, a4
-; RV32ZICOND-NEXT:    add a4, a4, t1
-; RV32ZICOND-NEXT:    sltu a5, t3, a5
-; RV32ZICOND-NEXT:    mulh a2, t2, a2
-; RV32ZICOND-NEXT:    add a0, a0, a1
+; RV32ZICOND-NEXT:    neg a3, a0
+; RV32ZICOND-NEXT:    li a4, -1
+; RV32ZICOND-NEXT:    srai a5, a1, 31
+; RV32ZICOND-NEXT:    neg a6, a1
+; RV32ZICOND-NEXT:    add a7, a0, a1
+; RV32ZICOND-NEXT:    mulhu t0, a0, a2
+; RV32ZICOND-NEXT:    mul t1, a1, a2
+; RV32ZICOND-NEXT:    mulhu t2, a1, a2
+; RV32ZICOND-NEXT:    mulhu t3, a0, a4
+; RV32ZICOND-NEXT:    mul t4, a5, a2
+; RV32ZICOND-NEXT:    mulhu a4, a1, a4
+; RV32ZICOND-NEXT:    mulh a2, a5, a2
+; RV32ZICOND-NEXT:    add t0, t1, t0
+; RV32ZICOND-NEXT:    sub a5, t4, a0
+; RV32ZICOND-NEXT:    sub a7, t3, a7
+; RV32ZICOND-NEXT:    sltu t1, t0, t1
 ; RV32ZICOND-NEXT:    sub a0, t0, a0
-; RV32ZICOND-NEXT:    add a0, a0, a2
-; RV32ZICOND-NEXT:    add a0, a0, a5
-; RV32ZICOND-NEXT:    add a0, a4, a0
-; RV32ZICOND-NEXT:    add a0, a0, t5
-; RV32ZICOND-NEXT:    srai a3, a3, 31
-; RV32ZICOND-NEXT:    xor a0, a0, a3
-; RV32ZICOND-NEXT:    xor a1, t4, a3
-; RV32ZICOND-NEXT:    or a0, a1, a0
+; RV32ZICOND-NEXT:    sltu t0, a5, a3
+; RV32ZICOND-NEXT:    add a2, a7, a2
+; RV32ZICOND-NEXT:    add t1, t2, t1
+; RV32ZICOND-NEXT:    sltu a3, a0, a3
+; RV32ZICOND-NEXT:    add a2, a2, t0
+; RV32ZICOND-NEXT:    srai a0, a0, 31
+; RV32ZICOND-NEXT:    add a3, t3, a3
+; RV32ZICOND-NEXT:    add a3, t1, a3
+; RV32ZICOND-NEXT:    sub a1, a3, a1
+; RV32ZICOND-NEXT:    sltu a3, a3, t1
+; RV32ZICOND-NEXT:    add a5, a1, a5
+; RV32ZICOND-NEXT:    sltu a6, a1, a6
+; RV32ZICOND-NEXT:    add a3, a4, a3
+; RV32ZICOND-NEXT:    sltu a1, a5, a1
+; RV32ZICOND-NEXT:    add a3, a3, a6
+; RV32ZICOND-NEXT:    add a2, a3, a2
+; RV32ZICOND-NEXT:    add a1, a2, a1
+; RV32ZICOND-NEXT:    xor a1, a1, a0
+; RV32ZICOND-NEXT:    xor a0, a5, a0
+; RV32ZICOND-NEXT:    or a0, a0, a1
 ; RV32ZICOND-NEXT:    beqz a0, .LBB62_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
@@ -5443,20 +5377,20 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32:       # %bb.0: # %entry
 ; RV32-NEXT:    mul a4, a3, a0
 ; RV32-NEXT:    mul a5, a1, a2
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    mulhu a5, a0, a2
-; RV32-NEXT:    add a4, a5, a4
-; RV32-NEXT:    sltu a4, a4, a5
-; RV32-NEXT:    snez a5, a3
-; RV32-NEXT:    snez a6, a1
-; RV32-NEXT:    and a5, a6, a5
-; RV32-NEXT:    mulhu a1, a1, a2
-; RV32-NEXT:    snez a1, a1
-; RV32-NEXT:    or a1, a5, a1
+; RV32-NEXT:    mulhu a6, a0, a2
 ; RV32-NEXT:    mulhu a0, a3, a0
+; RV32-NEXT:    snez a3, a3
+; RV32-NEXT:    mulhu a2, a1, a2
+; RV32-NEXT:    snez a1, a1
+; RV32-NEXT:    add a4, a5, a4
+; RV32-NEXT:    and a1, a1, a3
+; RV32-NEXT:    snez a2, a2
 ; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    add a4, a6, a4
+; RV32-NEXT:    or a1, a1, a2
+; RV32-NEXT:    sltu a2, a4, a6
 ; RV32-NEXT:    or a0, a1, a0
-; RV32-NEXT:    or a0, a0, a4
+; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    beqz a0, .LBB64_2
 ; RV32-NEXT:  # %bb.1: # %overflow
 ; RV32-NEXT:    li a0, 0
@@ -5480,20 +5414,20 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZBA:       # %bb.0: # %entry
 ; RV32ZBA-NEXT:    mul a4, a3, a0
 ; RV32ZBA-NEXT:    mul a5, a1, a2
-; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    mulhu a5, a0, a2
-; RV32ZBA-NEXT:    add a4, a5, a4
-; RV32ZBA-NEXT:    sltu a4, a4, a5
-; RV32ZBA-NEXT:    snez a5, a3
-; RV32ZBA-NEXT:    snez a6, a1
-; RV32ZBA-NEXT:    and a5, a6, a5
-; RV32ZBA-NEXT:    mulhu a1, a1, a2
-; RV32ZBA-NEXT:    snez a1, a1
-; RV32ZBA-NEXT:    or a1, a5, a1
+; RV32ZBA-NEXT:    mulhu a6, a0, a2
 ; RV32ZBA-NEXT:    mulhu a0, a3, a0
+; RV32ZBA-NEXT:    snez a3, a3
+; RV32ZBA-NEXT:    mulhu a2, a1, a2
+; RV32ZBA-NEXT:    snez a1, a1
+; RV32ZBA-NEXT:    add a4, a5, a4
+; RV32ZBA-NEXT:    and a1, a1, a3
+; RV32ZBA-NEXT:    snez a2, a2
 ; RV32ZBA-NEXT:    snez a0, a0
+; RV32ZBA-NEXT:    add a4, a6, a4
+; RV32ZBA-NEXT:    or a1, a1, a2
+; RV32ZBA-NEXT:    sltu a2, a4, a6
 ; RV32ZBA-NEXT:    or a0, a1, a0
-; RV32ZBA-NEXT:    or a0, a0, a4
+; RV32ZBA-NEXT:    or a0, a0, a2
 ; RV32ZBA-NEXT:    beqz a0, .LBB64_2
 ; RV32ZBA-NEXT:  # %bb.1: # %overflow
 ; RV32ZBA-NEXT:    li a0, 0
@@ -5517,20 +5451,20 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    mul a4, a3, a0
 ; RV32ZICOND-NEXT:    mul a5, a1, a2
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    mulhu a5, a0, a2
-; RV32ZICOND-NEXT:    add a4, a5, a4
-; RV32ZICOND-NEXT:    sltu a4, a4, a5
-; RV32ZICOND-NEXT:    snez a5, a3
-; RV32ZICOND-NEXT:    snez a6, a1
-; RV32ZICOND-NEXT:    and a5, a6, a5
-; RV32ZICOND-NEXT:    mulhu a1, a1, a2
-; RV32ZICOND-NEXT:    snez a1, a1
-; RV32ZICOND-NEXT:    or a1, a5, a1
+; RV32ZICOND-NEXT:    mulhu a6, a0, a2
 ; RV32ZICOND-NEXT:    mulhu a0, a3, a0
+; RV32ZICOND-NEXT:    snez a3, a3
+; RV32ZICOND-NEXT:    mulhu a2, a1, a2
+; RV32ZICOND-NEXT:    snez a1, a1
+; RV32ZICOND-NEXT:    add a4, a5, a4
+; RV32ZICOND-NEXT:    and a1, a1, a3
+; RV32ZICOND-NEXT:    snez a2, a2
 ; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    add a4, a6, a4
+; RV32ZICOND-NEXT:    or a1, a1, a2
+; RV32ZICOND-NEXT:    sltu a2, a4, a6
 ; RV32ZICOND-NEXT:    or a0, a1, a0
-; RV32ZICOND-NEXT:    or a0, a0, a4
+; RV32ZICOND-NEXT:    or a0, a0, a2
 ; RV32ZICOND-NEXT:    beqz a0, .LBB64_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow
 ; RV32ZICOND-NEXT:    li a0, 0
@@ -5624,13 +5558,13 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) {
 ; RV32ZICOND-LABEL: umulo2.br.i64:
 ; RV32ZICOND:       # %bb.0: # %entry
 ; RV32ZICOND-NEXT:    add a2, a0, a0
+; RV32ZICOND-NEXT:    add a3, a1, a1
 ; RV32ZICOND-NEXT:    sltu a0, a2, a0
-; RV32ZICOND-NEXT:    add a2, a1, a1
-; RV32ZICOND-NEXT:    add a2, a2, a0
-; RV32ZICOND-NEXT:    xor a3, a2, a1
-; RV32ZICOND-NEXT:    sltu a1, a2, a1
-; RV32ZICOND-NEXT:    czero.eqz a1, a1, a3
-; RV32ZICOND-NEXT:    czero.nez a0, a0, a3
+; RV32ZICOND-NEXT:    add a3, a3, a0
+; RV32ZICOND-NEXT:    xor a2, a3, a1
+; RV32ZICOND-NEXT:    sltu a1, a3, a1
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a2
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
 ; RV32ZICOND-NEXT:    or a0, a0, a1
 ; RV32ZICOND-NEXT:    beqz a0, .LBB65_2
 ; RV32ZICOND-NEXT:  # %bb.1: # %overflow

diff  --git a/llvm/test/CodeGen/RISCV/xtheadmac.ll b/llvm/test/CodeGen/RISCV/xtheadmac.ll
index 992c88e3e62685..78d18101979b3e 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmac.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmac.ll
@@ -39,13 +39,13 @@ define i64 @mula_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32XTHEADMAC:       # %bb.0:
 ; RV32XTHEADMAC-NEXT:    mulhu a6, a2, a4
 ; RV32XTHEADMAC-NEXT:    th.mula a6, a2, a5
+; RV32XTHEADMAC-NEXT:    mv a5, a0
+; RV32XTHEADMAC-NEXT:    th.mula a5, a2, a4
 ; RV32XTHEADMAC-NEXT:    th.mula a6, a3, a4
-; RV32XTHEADMAC-NEXT:    mv a3, a0
-; RV32XTHEADMAC-NEXT:    th.mula a3, a2, a4
-; RV32XTHEADMAC-NEXT:    sltu a0, a3, a0
+; RV32XTHEADMAC-NEXT:    sltu a0, a5, a0
 ; RV32XTHEADMAC-NEXT:    add a0, a1, a0
 ; RV32XTHEADMAC-NEXT:    add a1, a0, a6
-; RV32XTHEADMAC-NEXT:    mv a0, a3
+; RV32XTHEADMAC-NEXT:    mv a0, a5
 ; RV32XTHEADMAC-NEXT:    ret
 ;
 ; RV64XTHEADMAC-LABEL: mula_i64:
@@ -98,11 +98,11 @@ define i64 @muls_i64(i64 %a, i64 %b, i64 %c) {
 ; RV32XTHEADMAC:       # %bb.0:
 ; RV32XTHEADMAC-NEXT:    mulhu a6, a2, a4
 ; RV32XTHEADMAC-NEXT:    th.mula a6, a2, a5
-; RV32XTHEADMAC-NEXT:    th.mula a6, a3, a4
-; RV32XTHEADMAC-NEXT:    mul a3, a2, a4
-; RV32XTHEADMAC-NEXT:    sltu a3, a0, a3
+; RV32XTHEADMAC-NEXT:    mul a5, a2, a4
+; RV32XTHEADMAC-NEXT:    sltu a5, a0, a5
 ; RV32XTHEADMAC-NEXT:    th.muls a0, a2, a4
-; RV32XTHEADMAC-NEXT:    sub a1, a1, a3
+; RV32XTHEADMAC-NEXT:    th.mula a6, a3, a4
+; RV32XTHEADMAC-NEXT:    sub a1, a1, a5
 ; RV32XTHEADMAC-NEXT:    sub a1, a1, a6
 ; RV32XTHEADMAC-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
index 46aa383866e93a..e761fcb736a87c 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
@@ -1045,8 +1045,8 @@ define void @srd(ptr %a, i64 %b, i64 %c) {
 ; RV32XTHEADMEMIDX-LABEL: srd:
 ; RV32XTHEADMEMIDX:       # %bb.0:
 ; RV32XTHEADMEMIDX-NEXT:    add a2, a3, a3
-; RV32XTHEADMEMIDX-NEXT:    sltu a3, a2, a3
 ; RV32XTHEADMEMIDX-NEXT:    add a4, a4, a4
+; RV32XTHEADMEMIDX-NEXT:    sltu a3, a2, a3
 ; RV32XTHEADMEMIDX-NEXT:    add a3, a4, a3
 ; RV32XTHEADMEMIDX-NEXT:    slli a4, a1, 3
 ; RV32XTHEADMEMIDX-NEXT:    add a4, a0, a4
@@ -1069,8 +1069,8 @@ define void @surd(ptr %a, i32 %b, i64 %c) {
 ; RV32XTHEADMEMIDX-LABEL: surd:
 ; RV32XTHEADMEMIDX:       # %bb.0:
 ; RV32XTHEADMEMIDX-NEXT:    add a4, a2, a2
-; RV32XTHEADMEMIDX-NEXT:    sltu a2, a4, a2
 ; RV32XTHEADMEMIDX-NEXT:    add a3, a3, a3
+; RV32XTHEADMEMIDX-NEXT:    sltu a2, a4, a2
 ; RV32XTHEADMEMIDX-NEXT:    add a2, a3, a2
 ; RV32XTHEADMEMIDX-NEXT:    slli a3, a1, 3
 ; RV32XTHEADMEMIDX-NEXT:    add a3, a0, a3

diff  --git a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll
index a4f56b6d28409c..9a312d9daca8db 100644
--- a/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll
+++ b/llvm/test/CodeGen/RISCV/zdinx-boundary-check.ll
@@ -39,9 +39,9 @@ define void @foo2(ptr nocapture %p, double %d) nounwind {
 ; RV32ZDINX-LABEL: foo2:
 ; RV32ZDINX:       # %bb.0: # %entry
 ; RV32ZDINX-NEXT:    mv a3, a2
+; RV32ZDINX-NEXT:    addi a0, a0, 2047
 ; RV32ZDINX-NEXT:    mv a2, a1
 ; RV32ZDINX-NEXT:    fadd.d a2, a2, a2
-; RV32ZDINX-NEXT:    addi a0, a0, 2047
 ; RV32ZDINX-NEXT:    sw a2, -3(a0)
 ; RV32ZDINX-NEXT:    sw a3, 1(a0)
 ; RV32ZDINX-NEXT:    ret
@@ -49,9 +49,9 @@ define void @foo2(ptr nocapture %p, double %d) nounwind {
 ; RV32ZDINXUALIGNED-LABEL: foo2:
 ; RV32ZDINXUALIGNED:       # %bb.0: # %entry
 ; RV32ZDINXUALIGNED-NEXT:    mv a3, a2
+; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, 2047
 ; RV32ZDINXUALIGNED-NEXT:    mv a2, a1
 ; RV32ZDINXUALIGNED-NEXT:    fadd.d a2, a2, a2
-; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, 2047
 ; RV32ZDINXUALIGNED-NEXT:    sw a2, -3(a0)
 ; RV32ZDINXUALIGNED-NEXT:    sw a3, 1(a0)
 ; RV32ZDINXUALIGNED-NEXT:    ret
@@ -183,10 +183,10 @@ entry:
 define void @foo6(ptr %p, double %d) nounwind {
 ; RV32ZDINX-LABEL: foo6:
 ; RV32ZDINX:       # %bb.0: # %entry
-; RV32ZDINX-NEXT:    lui a3, %hi(.LCPI5_0)
-; RV32ZDINX-NEXT:    lw a4, %lo(.LCPI5_0)(a3)
-; RV32ZDINX-NEXT:    lw a5, %lo(.LCPI5_0+4)(a3)
 ; RV32ZDINX-NEXT:    mv a3, a2
+; RV32ZDINX-NEXT:    lui a2, %hi(.LCPI5_0)
+; RV32ZDINX-NEXT:    lw a4, %lo(.LCPI5_0)(a2)
+; RV32ZDINX-NEXT:    lw a5, %lo(.LCPI5_0+4)(a2)
 ; RV32ZDINX-NEXT:    mv a2, a1
 ; RV32ZDINX-NEXT:    fadd.d a2, a2, a4
 ; RV32ZDINX-NEXT:    addi a0, a0, 2047
@@ -196,10 +196,10 @@ define void @foo6(ptr %p, double %d) nounwind {
 ;
 ; RV32ZDINXUALIGNED-LABEL: foo6:
 ; RV32ZDINXUALIGNED:       # %bb.0: # %entry
-; RV32ZDINXUALIGNED-NEXT:    lui a3, %hi(.LCPI5_0)
-; RV32ZDINXUALIGNED-NEXT:    lw a4, %lo(.LCPI5_0)(a3)
-; RV32ZDINXUALIGNED-NEXT:    lw a5, %lo(.LCPI5_0+4)(a3)
 ; RV32ZDINXUALIGNED-NEXT:    mv a3, a2
+; RV32ZDINXUALIGNED-NEXT:    lui a2, %hi(.LCPI5_0)
+; RV32ZDINXUALIGNED-NEXT:    lw a4, %lo(.LCPI5_0)(a2)
+; RV32ZDINXUALIGNED-NEXT:    lw a5, %lo(.LCPI5_0+4)(a2)
 ; RV32ZDINXUALIGNED-NEXT:    mv a2, a1
 ; RV32ZDINXUALIGNED-NEXT:    fadd.d a2, a2, a4
 ; RV32ZDINXUALIGNED-NEXT:    addi a0, a0, 2047


        


More information about the llvm-commits mailing list